Skip to content

Commit

Permalink
Merge pull request PelicanPlatform#991 from haoming29/cancel-dir-timeout
Browse files Browse the repository at this point in the history
Do not log the "director test timeout" warning if the director can't be reached
  • Loading branch information
turetske authored Apr 4, 2024
2 parents dd9cbae + e7e1294 commit e1772bd
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 11 deletions.
30 changes: 23 additions & 7 deletions metrics/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package metrics

import (
"fmt"
"sync"
"time"

Expand All @@ -41,15 +42,16 @@ type (
}

HealthStatus struct {
OverallStatus string `json:"status"`
ComponentStatus map[string]ComponentStatus `json:"components"`
OverallStatus string `json:"status"`
ComponentStatus map[HealthStatusComponent]ComponentStatus `json:"components"`
}

HealthStatusEnum int

HealthStatusComponent string
)

// HealthStatusEnum values are stored both as Prometheus values and in the internal status struct
const (
StatusCritical HealthStatusEnum = iota + 1
StatusWarning
Expand All @@ -75,7 +77,7 @@ const (
)

var (
healthStatus = sync.Map{}
healthStatus = sync.Map{} // In-memory map of component health status, key is HealthStatusComponent, value is componentStatusInternal

PelicanHealthStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "pelican_component_health_status",
Expand Down Expand Up @@ -110,7 +112,7 @@ func (component HealthStatusComponent) String() string {
// use only, please try to avoid setting this as your component status
func SetComponentHealthStatus(name HealthStatusComponent, state HealthStatusEnum, msg string) {
now := time.Now()
healthStatus.Store(name.String(), componentStatusInternal{state, msg, now})
healthStatus.Store(name, componentStatusInternal{state, msg, now})

PelicanHealthStatus.With(
prometheus.Labels{"component": name.String()}).
Expand All @@ -121,7 +123,7 @@ func SetComponentHealthStatus(name HealthStatusComponent, state HealthStatusEnum
}

// DeleteComponentHealthStatus removes the entry stored for name from the
// package-level healthStatus map.
//
// NOTE(review): this listing is a diff rendered without +/- markers, so the two
// Delete calls below are presumably the old (string key) and new (typed key)
// versions of the same statement rather than two live calls — confirm against
// the actual file.
func DeleteComponentHealthStatus(name HealthStatusComponent) {
healthStatus.Delete(name.String())
healthStatus.Delete(name)
}

func GetHealthStatus() HealthStatus {
Expand All @@ -133,12 +135,12 @@ func GetHealthStatus() HealthStatus {
if !ok {
return true
}
componentString, ok := component.(string)
componentString, ok := component.(HealthStatusComponent)
if !ok {
return true
}
if status.ComponentStatus == nil {
status.ComponentStatus = make(map[string]ComponentStatus)
status.ComponentStatus = make(map[HealthStatusComponent]ComponentStatus)
}
status.ComponentStatus[componentString] = ComponentStatus{
componentStatus.Status.String(),
Expand All @@ -153,3 +155,17 @@ func GetHealthStatus() HealthStatus {
status.OverallStatus = overallStatus.String()
return status
}

// GetComponentStatus looks up comp in the in-memory healthStatus map and
// returns the string form of the status recorded for it.
// Status can be critical|warning|ok|unknown
//
// An error is returned when nothing is stored for comp, or when the stored
// value is not a componentStatusInternal.
func GetComponentStatus(comp HealthStatusComponent) (status string, err error) {
	stored, found := healthStatus.Load(comp)
	if !found {
		return "", fmt.Errorf("component %s does not exist", comp.String())
	}

	// sync.Map holds untyped values, so confirm the entry has the expected shape
	// before reading its Status field.
	switch entry := stored.(type) {
	case componentStatusInternal:
		return entry.Status.String(), nil
	default:
		return "", fmt.Errorf("wrong format of component status for component %s", comp.String())
	}
}
14 changes: 10 additions & 4 deletions server_utils/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,17 @@ func LaunchPeriodicDirectorTimeout(ctx context.Context, egrp *errgroup.Group, nC
for {
select {
case <-directorTimeoutTicker.C:
// Timer fired because no message was received in time.
log.Warningln("No director test report received within the time limit")
metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit")
// If origin can't contact the director, record the error without warning
status, err := metrics.GetComponentStatus(metrics.OriginCache_Federation)
if err == nil && status == "critical" {
metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "Failed to advertise to the director. Tests are not expected")
} else {
// Timer fired because no message was received in time.
log.Warningln("No director test report received within the time limit")
metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit")
}
case <-nChan:
log.Debugln("Got notification from director")
log.Debugln("Received director report of health test result")
directorTimeoutTicker.Reset(directorTimeoutDuration)
case <-ctx.Done():
log.Infoln("Director health test timeout loop has been terminated")
Expand Down

0 comments on commit e1772bd

Please sign in to comment.