Skip to content

Commit

Permalink
Add Avalanche liveness health checks (#1287)
Browse files (browse the repository at this point in the history)
Co-authored-by: Dan Laine <daniel.laine@avalabs.org>
  • Loading branch information
StephenButtolph and Dan Laine authored Apr 3, 2023
1 parent b77ed9f commit 5f45fe5
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
9 changes: 9 additions & 0 deletions snow/consensus/avalanche/topological.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,12 @@ func (ta *Topological) HealthCheck(ctx context.Context) (interface{}, error) {
"outstandingVertices": numOutstandingVtx,
}

// check for long running vertices
oldestProcessingDuration := ta.Latency.MeasureAndGetOldestDuration()
processingTimeOK := oldestProcessingDuration <= ta.params.MaxItemProcessingTime
healthy = healthy && processingTimeOK
details["longestRunningVertex"] = oldestProcessingDuration.String()

snowstormReport, err := ta.cg.HealthCheck(ctx)
healthy = healthy && err == nil
details["snowstorm"] = snowstormReport
Expand All @@ -306,6 +312,9 @@ func (ta *Topological) HealthCheck(ctx context.Context) (interface{}, error) {
if isOutstandingVtx {
errorReasons = append(errorReasons, fmt.Sprintf("number outstanding vertexes %d > %d", numOutstandingVtx, ta.params.MaxOutstandingItems))
}
if !processingTimeOK {
errorReasons = append(errorReasons, fmt.Sprintf("vertex processing time %s > %s", oldestProcessingDuration, ta.params.MaxItemProcessingTime))
}
if err != nil {
errorReasons = append(errorReasons, err.Error())
}
Expand Down
21 changes: 18 additions & 3 deletions snow/consensus/snowstorm/directed.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package snowstorm
import (
"context"
"fmt"
"strings"

"github.com/prometheus/client_golang/prometheus"

Expand Down Expand Up @@ -182,12 +183,26 @@ func (dg *Directed) Finalized() bool {
// HealthCheck reports the health of this snowstorm instance.
//
// It returns a details map containing the number of processing transactions
// ("outstandingTransactions") and, in the post-change code, the string form
// of the value returned by dg.Latency.MeasureAndGetOldestDuration()
// ("longestRunningTransaction"). The returned error is nil when every check
// passes; otherwise it lists each violated limit, joined with ", ".
//
// NOTE(review): this listing is a rendered diff without +/- markers, so
// pre-change and post-change lines are interleaved and the text is not valid
// Go as shown. See the note before the first `if` below.
func (dg *Directed) HealthCheck(context.Context) (interface{}, error) {
numOutstandingTxs := dg.Latency.NumProcessing()
// Despite its name, isOutstandingTxs is true when the count is within the
// configured limit (i.e. this check passed).
isOutstandingTxs := numOutstandingTxs <= dg.params.MaxOutstandingItems
healthy := isOutstandingTxs
details := map[string]interface{}{
"outstandingTransactions": numOutstandingTxs,
}
// NOTE(review): the next three lines appear to be the lines this commit
// removed (the file header reports 3 deletions). They returned early with a
// single reason; the replacement logic below collects all reasons instead.
if !isOutstandingTxs {
errorReason := fmt.Sprintf("number of outstanding txs %d > %d", numOutstandingTxs, dg.params.MaxOutstandingItems)
return details, fmt.Errorf("snowstorm consensus is not healthy reason: %s", errorReason)

// Check for long running transactions: the instance is unhealthy when the
// measured duration exceeds dg.params.MaxItemProcessingTime.
oldestProcessingDuration := dg.Latency.MeasureAndGetOldestDuration()
processingTimeOK := oldestProcessingDuration <= dg.params.MaxItemProcessingTime
healthy = healthy && processingTimeOK
details["longestRunningTransaction"] = oldestProcessingDuration.String()

if !healthy {
// Gather every failed check so the error names all of them, not just the first.
var errorReasons []string
if !isOutstandingTxs {
errorReasons = append(errorReasons, fmt.Sprintf("number outstanding transactions %d > %d", numOutstandingTxs, dg.params.MaxOutstandingItems))
}
if !processingTimeOK {
errorReasons = append(errorReasons, fmt.Sprintf("transaction processing time %s > %s", oldestProcessingDuration, dg.params.MaxItemProcessingTime))
}
// details is returned alongside the error so callers still get the metrics.
return details, fmt.Errorf("snowstorm consensus is not healthy reason: %s", strings.Join(errorReasons, ", "))
}
return details, nil
}
Expand Down

0 comments on commit 5f45fe5

Please sign in to comment.