Skip to content

Commit

Permalink
Merge pull request #52 from fcuny-rbx/fcuny/detector-check-metrics
Browse files Browse the repository at this point in the history
feat(detector): add metrics for health checks
  • Loading branch information
fcuny-rbx authored Sep 15, 2022
2 parents 958727d + e17e310 commit 0d31871
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions detector/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ var (
Name: "npd_detector_info",
Help: "Information about the npd detector",
}, []string{"version"})

healthCheckErrorCounter = &prometheus.CounterVec{}
healthCheckProblemCounter = &prometheus.CounterVec{}
healthCheckProblemGauge = &prometheus.GaugeVec{}
)

// TODO: add comments describing the locking/contention behavior.
Expand Down Expand Up @@ -233,6 +237,19 @@ func collect(done chan bool, detectorCycleTime time.Duration, limits *Limits) {
startServer = true
done <- startServer
}

for _, hc := range m {
// Once all the checks have run, update the counter and the gauge
// for each check. If the check failed, increase the counter by 1
// and set the gauge to 1; otherwise the gauge is reset to 0 and
// the counter is left unchanged.
var failed = 0
if hc.Result == "true" || hc.Result == "Unhealthy" {
failed = 1
}
healthCheckProblemGauge.With(prometheus.Labels{"check": hc.Type}).Set(float64(failed))
healthCheckProblemCounter.With(prometheus.Labels{"check": hc.Type}).Add(float64(failed))
}

time.Sleep(detectorCycleTime)
}

Expand All @@ -246,6 +263,7 @@ func getCPUStats(cpuLimit float64) {
cpuStats, err := collectCPUStats()
if err != nil {
hc.Update("true", err.Error())
healthCheckErrorCounter.With(prometheus.Labels{"check": hc.Type}).Inc()
} else if cpuStats.User >= cpuLimit {
hc.Update("true", fmt.Sprintf("CPU usage: %f %%", cpuStats.User))
} else {
Expand All @@ -267,6 +285,7 @@ func getMemoryStats(memoryLimit float64) {
memoryStats, err := collectMemoryStats()
if err != nil {
hc.Update("true", err.Error())
healthCheckErrorCounter.With(prometheus.Labels{"check": hc.Type}).Inc()
} else {
availableMemory := units.HumanSize(float64(memoryStats.Available))
availableMemoryPercent := (float64(memoryStats.Available) / float64(memoryStats.Total)) * 100
Expand All @@ -293,6 +312,7 @@ func getDiskStats(diskLimit float64) {
diskStats, err := collectDiskStats()
if err != nil {
hc.Update("true", err.Error())
healthCheckErrorCounter.With(prometheus.Labels{"check": hc.Type}).Inc()
} else if diskStats.UsedPercent >= diskLimit {
hc.Update("true", fmt.Sprintf("disk usage is %f %%", diskStats.UsedPercent))
} else {
Expand Down Expand Up @@ -399,9 +419,32 @@ func metricsHandler(registry *prometheus.Registry) http.Handler {
}

// registerMetrics (re)creates the three health-check metric vectors, stores
// them in the package-level variables, and returns a new registry on which
// they are registered together with the Go collector, the process collector
// and detectorInfo.
//
// If the NOMAD_DC environment variable is set, its value is attached to each
// health-check metric as the constant label "nomad_dc". Every vector carries
// a single variable label, "check".
func registerMetrics() *prometheus.Registry {
	// Remains nil (no constant labels) unless NOMAD_DC is present.
	var dcLabels prometheus.Labels
	if dc, found := os.LookupEnv("NOMAD_DC"); found {
		dcLabels = prometheus.Labels{"nomad_dc": dc}
	}

	// Incremented when a check could not be evaluated at all.
	healthCheckErrorCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name:        "npd_detector_check_error_count",
			Help:        "Number of time a specific health check errored out",
			ConstLabels: dcLabels,
		},
		[]string{"check"},
	)

	// Running total of cycles in which a check reported a problem.
	healthCheckProblemCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name:        "npd_detector_problem_count",
			Help:        "Number of time a specific health checks failed",
			ConstLabels: dcLabels,
		},
		[]string{"check"},
	)

	// Current state of a check: set by the collection loop on every cycle.
	healthCheckProblemGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name:        "npd_detector_problem",
			Help:        "If a specific check is affecting the host or not",
			ConstLabels: dcLabels,
		},
		[]string{"check"},
	)

	// MustRegister processes its arguments in order and panics on the first
	// collector that fails to register.
	registry := prometheus.NewRegistry()
	registry.MustRegister(
		prometheus.NewGoCollector(),
		prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}),
		detectorInfo,
		healthCheckProblemCounter,
		healthCheckProblemGauge,
		healthCheckErrorCounter,
	)
	return registry
}

0 comments on commit 0d31871

Please sign in to comment.