Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release 4.7] Bug 1935605: Adds memory usage to the metadata #364

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 220 additions & 0 deletions docs/insights-archive-sample/insights-operator/gathers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
{
"status_reports": [
{
"name": "clusterconfig.GatherMostRecentMetrics",
"duration_in_ms": 0,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherSAPConfig",
"duration_in_ms": 204,
"records_count": 0,
"errors": [
"datahubs.installers.datahub.sap.com is forbidden: User \"system:serviceaccount:openshift-insights:gather\" cannot list resource \"datahubs\" in API group \"installers.datahub.sap.com\" at the cluster scope"
]
},
{
"name": "clusterconfig.GatherClusterVersion",
"duration_in_ms": 205,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherClusterImagePruner",
"duration_in_ms": 207,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherOLMOperators",
"duration_in_ms": 335,
"records_count": 0,
"errors": [
"operators.operators.coreos.com is forbidden: User \"system:serviceaccount:openshift-insights:gather\" cannot list resource \"operators\" in API group \"operators.coreos.com\" at the cluster scope"
]
},
{
"name": "clusterconfig.GatherClusterNetwork",
"duration_in_ms": 337,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherContainerRuntimeConfig",
"duration_in_ms": 338,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherHostSubnet",
"duration_in_ms": 339,
"records_count": 6,
"errors": null
},
{
"name": "clusterconfig.GatherClusterInfrastructure",
"duration_in_ms": 470,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherClusterAuthentication",
"duration_in_ms": 871,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherClusterIngress",
"duration_in_ms": 879,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherClusterProxy",
"duration_in_ms": 882,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherMachineSet",
"duration_in_ms": 891,
"records_count": 3,
"errors": null
},
{
"name": "clusterconfig.GatherClusterID",
"duration_in_ms": 1002,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherClusterOAuth",
"duration_in_ms": 1152,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherSAPVsystemIptablesLogs",
"duration_in_ms": 1289,
"records_count": 0,
"errors": [
"datahubs.installers.datahub.sap.com is forbidden: User \"system:serviceaccount:openshift-insights:gather\" cannot list resource \"datahubs\" in API group \"installers.datahub.sap.com\" at the cluster scope"
]
},
{
"name": "clusterconfig.GatherClusterImageRegistry",
"duration_in_ms": 1289,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherCertificateSigningRequests",
"duration_in_ms": 1424,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherSAPPods",
"duration_in_ms": 1558,
"records_count": 0,
"errors": [
"datahubs.installers.datahub.sap.com is forbidden: User \"system:serviceaccount:openshift-insights:gather\" cannot list resource \"datahubs\" in API group \"installers.datahub.sap.com\" at the cluster scope"
]
},
{
"name": "clusterconfig.GatherPodDisruptionBudgets",
"duration_in_ms": 1560,
"records_count": 2,
"errors": null
},
{
"name": "clusterconfig.GatherClusterFeatureGates",
"duration_in_ms": 1699,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherNetNamespace",
"duration_in_ms": 2176,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherMachineConfigPool",
"duration_in_ms": 2228,
"records_count": 2,
"errors": null
},
{
"name": "clusterconfig.GatherCRD",
"duration_in_ms": 2396,
"records_count": 2,
"errors": null
},
{
"name": "clusterconfig.GatherNodes",
"duration_in_ms": 2640,
"records_count": 6,
"errors": null
},
{
"name": "clusterconfig.GatherConfigMaps",
"duration_in_ms": 3047,
"records_count": 10,
"errors": [
"configmaps \"cluster-monitoring-config\" not found"
]
},
{
"name": "clusterconfig.GatherOpenShiftAPIServerOperatorLogs",
"duration_in_ms": 3316,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherOpenshiftAuthenticationLogs",
"duration_in_ms": 3578,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherOpenshiftSDNControllerLogs",
"duration_in_ms": 3578,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherContainerImages",
"duration_in_ms": 4162,
"records_count": 19,
"errors": null
},
{
"name": "clusterconfig.GatherOpenshiftSDNLogs",
"duration_in_ms": 5635,
"records_count": 0,
"errors": null
},
{
"name": "clusterconfig.GatherClusterOperators",
"duration_in_ms": 7744,
"records_count": 31,
"errors": null
},
{
"name": "clusterconfig.GatherInstallPlans",
"duration_in_ms": 13408,
"records_count": 1,
"errors": null
},
{
"name": "clusterconfig.GatherServiceAccounts",
"duration_in_ms": 14211,
"records_count": 1,
"errors": null
}
],
"memory_alloc_bytes": 20949008,
"uptime_seconds": 58.282
}
104 changes: 77 additions & 27 deletions pkg/gather/clusterconfig/0_gatherer.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ import (
"github.com/openshift/insights-operator/pkg/record"
)

// gatherMetadata is the JSON document recorded by recordGatherReport: the
// per-gatherer status reports plus two process-level figures.
//
// NOTE: recordGatherReport builds this struct with a positional composite
// literal, so the field order below must not change without updating it.
type gatherMetadata struct {
// StatusReports holds one entry per gatherer that was run.
StatusReports []gatherStatusReport `json:"status_reports"`
// MemoryAlloc is filled from runtime.MemStats.HeapAlloc, in bytes.
MemoryAlloc uint64 `json:"memory_alloc_bytes"`
// Uptime is the time elapsed since startTime, truncated to milliseconds
// and expressed in seconds.
Uptime float64 `json:"uptime_seconds"`
}

type gatherStatusReport struct {
Name string `json:"name"`
Duration time.Duration `json:"duration_in_ms"`
Expand All @@ -41,6 +47,27 @@ type gathering struct {
canFail bool
}

// GatherInfo bundles everything known about one finished gatherer: the
// registered function, its resolved name, and the result that was received
// for it through reflection.
type GatherInfo struct {
// name is the function name reported by runtime.FuncForPC for function.
name string
// result is the gatherResult extracted from the received reflect.Value.
result gatherResult
// function is the gather function registered in gatherFunctions.
function gatherFunction
// canFail is copied from the gatherFunctions entry; when true,
// createStatusReport only logs the gatherer's errors instead of returning them.
canFail bool
// rvString is the String() form of the received reflect.Value, kept for
// verbose logging.
rvString string
}

// NewGatherInfo builds the GatherInfo for the gatherer registered under the
// key gather in gatherFunctions, pairing it with rv, the reflected value
// received for that gatherer.
//
// rv must wrap a gatherResult; the type assertion panics otherwise.
func NewGatherInfo(gather string, rv reflect.Value) *GatherInfo {
	// Look the registration up once and reuse it for every field.
	entry := gatherFunctions[gather]

	info := new(GatherInfo)
	info.name = runtime.FuncForPC(reflect.ValueOf(entry.function).Pointer()).Name()
	info.result = rv.Interface().(gatherResult)
	info.function = entry.function
	info.canFail = entry.canFail
	info.rvString = rv.String()
	return info
}

// important wraps function in a gathering whose canFail flag is false.
func important(function gatherFunction) gathering {
	return gathering{
		function: function,
		canFail:  false,
	}
}
Expand Down Expand Up @@ -86,8 +113,11 @@ var gatherFunctions = map[string]gathering{
"olm_operators": failable(GatherOLMOperators),
}

// startTime is assigned in New and read by recordGatherReport to compute the
// uptime_seconds value of the gather metadata.
var startTime time.Time

// New creates a new Gatherer.
func New(gatherKubeConfig *rest.Config, gatherProtoKubeConfig *rest.Config, metricsGatherKubeConfig *rest.Config) *Gatherer {
startTime = time.Now()
return &Gatherer{
gatherKubeConfig: gatherKubeConfig,
gatherProtoKubeConfig: gatherProtoKubeConfig,
Expand Down Expand Up @@ -116,36 +146,19 @@ func (g *Gatherer) Gather(ctx context.Context, gatherList []string, recorder rec
}

// Gets the info from the goroutines
remaining := len(cases)
for remaining > 0 {
for range gatherList {
chosen, value, _ := reflect.Select(cases)
// The chosen channel has been closed, so zero out the channel to disable the case
cases[chosen].Chan = reflect.ValueOf(nil)
remaining -= 1

elapsed := time.Since(starts[chosen]).Truncate(time.Millisecond)

gatherResults, _ := value.Interface().(gatherResult)
gatherFunc := gatherFunctions[gatherList[chosen]].function
gatherCanFail := gatherFunctions[gatherList[chosen]].canFail
gatherName := runtime.FuncForPC(reflect.ValueOf(gatherFunc).Pointer()).Name()
klog.V(4).Infof("Gather %s took %s to process %d records", gatherName, elapsed, len(gatherResults.records))
gatherReport = append(gatherReport, gatherStatusReport{gatherName, time.Duration(elapsed.Milliseconds()), len(gatherResults.records), extractErrors(gatherResults.errors)})

if gatherCanFail {
for _, err := range gatherResults.errors {
klog.V(5).Infof("Couldn't gather %s' received following error: %s\n", gatherName, err.Error())
}
} else {
errors = append(errors, extractErrors(gatherResults.errors)...)
}
for _, record := range gatherResults.records {
if err := recorder.Record(record); err != nil {
errors = append(errors, fmt.Sprintf("unable to record %s: %v", record.Name, err))
continue
}
gather := gatherList[chosen]

gi := NewGatherInfo(gather, value)
statusReport, errorsReport := createStatusReport(gi, recorder, starts[chosen])

if len(errorsReport) > 0 {
errors = append(errors, errorsReport...)
}
klog.V(5).Infof("Read from %s's channel and received %s\n", gatherName, value.String())
gatherReport = append(gatherReport, statusReport)
}

// Creates the gathering performance report
Expand All @@ -159,6 +172,40 @@ func (g *Gatherer) Gather(ctx context.Context, gatherList []string, recorder rec
return nil
}

// createStatusReport builds the status report for one finished gatherer and
// passes its records to recorder.
//
// The reported duration is the time elapsed since starts, truncated to
// milliseconds. The report name is gather.name with the first occurrence of
// the insights-operator gather package path removed.
//
// The returned slice contains the gatherer's own errors only when the
// gatherer is not allowed to fail (failable gatherers just get them logged at
// verbosity 5), followed by any errors raised while recording the records.
func createStatusReport(gather *GatherInfo, recorder record.Interface, starts time.Time) (gatherStatusReport, []string) {
	took := time.Since(starts).Truncate(time.Millisecond)
	records := gather.result.records
	gatherErrs := gather.result.errors

	klog.V(4).Infof("Gather %s took %s to process %d records", gather.name, took, len(records))

	report := gatherStatusReport{
		strings.Replace(gather.name, "github.com/openshift/insights-operator/pkg/gather/", "", 1),
		// The field is serialized as a plain millisecond count.
		time.Duration(took.Milliseconds()),
		len(records),
		extractErrors(gatherErrs),
	}

	var failures []string
	if !gather.canFail {
		failures = extractErrors(gatherErrs)
	} else {
		for _, gatherErr := range gatherErrs {
			klog.V(5).Infof("Couldn't gather %s' received following error: %s\n", gather.name, gatherErr.Error())
		}
	}

	failures = append(failures, recordStatusReport(recorder, records)...)
	klog.V(5).Infof("Read from %s's channel and received %s\n", gather.name, gather.rvString)

	return report, failures
}

// recordStatusReport passes every record to recorder and returns one message
// per record that could not be stored. A failure does not stop the remaining
// records from being recorded. The result is nil when nothing failed.
func recordStatusReport(recorder record.Interface, records []record.Record) []string {
	var failures []string
	for i := range records {
		rec := records[i]
		err := recorder.Record(rec)
		if err == nil {
			continue
		}
		failures = append(failures, fmt.Sprintf("unable to record %s: %v", rec.Name, err))
	}
	return failures
}

// Runs each gather function in a goroutine.
// Every gather function is given its own channel to send back the results.
// 1. return value: `cases` list, used for dynamically reading from the channels.
Expand Down Expand Up @@ -190,7 +237,10 @@ func (g *Gatherer) startGathering(gatherList []string, errors *[]string) ([]refl
}

// recordGatherReport stores the gathering metadata under the record name
// "insights-operator/gathers".
//
// The metadata wraps the per-gatherer status reports together with the
// current heap allocation (runtime.MemStats.HeapAlloc) and the time elapsed
// since startTime, truncated to milliseconds and expressed in seconds.
//
// It returns whatever error recorder.Record reports.
func recordGatherReport(recorder record.Interface, report []gatherStatusReport) error {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)

	// Keyed fields keep this literal correct even if gatherMetadata is
	// reordered or extended.
	metadata := gatherMetadata{
		StatusReports: report,
		MemoryAlloc:   m.HeapAlloc,
		Uptime:        time.Since(startTime).Truncate(time.Millisecond).Seconds(),
	}

	// Record the metadata wrapper, not the bare report slice, so the memory
	// and uptime figures are included in the archive.
	r := record.Record{Name: "insights-operator/gathers", Item: record.JSONMarshaller{Object: metadata}}
	return recorder.Record(r)
}

Expand Down