Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an API method for determining the best status #2544

Merged
merged 2 commits into from
Nov 30, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions api/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package api

import (
"fmt"
"strings"
)

const (
Expand All @@ -11,6 +12,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)

const (
// NodeMaint is the special key set by a node in maintenance mode.
NodeMaint = "_node_maintenance"

// ServiceMaintPrefix is the prefix for a service in maintenance mode.
ServiceMaintPrefix = "_service_maintenance:"
)

// HealthCheck is used to represent a single check
Expand All @@ -25,6 +35,51 @@ type HealthCheck struct {
ServiceName string
}

// HealthChecks is a collection of HealthCheck structs.
type HealthChecks []*HealthCheck

// AggregatedStatus returns the "best" status for the list of health checks.
// Because a given entry may have many service and node-level health checks
// attached, this function determines the best representative of the status as
// as single string using the following heuristic:
//
// maintenance > critical > warning > passing
//
func (c HealthChecks) AggregatedStatus() string {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd make this have an error return and bail with an error for the return "" case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Talked about this and just returning the empty string 😄

var passing, warning, critical, maintenance bool
for _, check := range c {
id := string(check.CheckID)
if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
maintenance = true
continue
}

switch check.Status {
case HealthPassing:
passing = true
case HealthWarning:
warning = true
case HealthCritical:
critical = true
default:
return ""
}
}

switch {
case maintenance:
return HealthMaint
case critical:
return HealthCritical
case warning:
return HealthWarning
case passing:
return HealthPassing
default:
return HealthPassing
}
}

// ServiceEntry is used for the health service endpoint
type ServiceEntry struct {
Node *Node
Expand Down
133 changes: 133 additions & 0 deletions api/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
})
}

func TestHealthChecks_AggregatedStatus(t *testing.T) {
t.Parallel()

cases := []struct {
name string
checks HealthChecks
exp string
}{
{
"empty",
nil,
HealthPassing,
},
{
"passing",
HealthChecks{
&HealthCheck{
Status: HealthPassing,
},
},
HealthPassing,
},
{
"warning",
HealthChecks{
&HealthCheck{
Status: HealthWarning,
},
},
HealthWarning,
},
{
"critical",
HealthChecks{
&HealthCheck{
Status: HealthCritical,
},
},
HealthCritical,
},
{
"node_maintenance",
HealthChecks{
&HealthCheck{
CheckID: NodeMaint,
},
},
HealthMaint,
},
{
"service_maintenance",
HealthChecks{
&HealthCheck{
CheckID: ServiceMaintPrefix + "service",
},
},
HealthMaint,
},
{
"unknown",
HealthChecks{
&HealthCheck{
Status: "nope-nope-noper",
},
},
"",
},
{
"maintenance_over_critical",
HealthChecks{
&HealthCheck{
CheckID: NodeMaint,
},
&HealthCheck{
Status: HealthCritical,
},
},
HealthMaint,
},
{
"critical_over_warning",
HealthChecks{
&HealthCheck{
Status: HealthCritical,
},
&HealthCheck{
Status: HealthWarning,
},
},
HealthCritical,
},
{
"warning_over_passing",
HealthChecks{
&HealthCheck{
Status: HealthWarning,
},
&HealthCheck{
Status: HealthPassing,
},
},
HealthWarning,
},
{
"lots",
HealthChecks{
&HealthCheck{
Status: HealthPassing,
},
&HealthCheck{
Status: HealthPassing,
},
&HealthCheck{
Status: HealthPassing,
},
&HealthCheck{
Status: HealthWarning,
},
},
HealthWarning,
},
}

for i, tc := range cases {
t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
act := tc.checks.AggregatedStatus()
if tc.exp != act {
t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
}
})
}
}

func TestHealth_Checks(t *testing.T) {
t.Parallel()
c, s := makeClient(t)
Expand Down
14 changes: 5 additions & 9 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ const (
checksDir = "checks"
checkStateDir = "checks/state"

// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
nodeMaintCheckID = "_node_maintenance"

// Default reasons for node/service maintenance mode
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
"but no reason was provided. This is a default message."
Expand Down Expand Up @@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {

// serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) types.CheckID {
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
}

// EnableServiceMaintenance will register a false health check against the given
Expand Down Expand Up @@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
// EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
return
}

Expand All @@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Create and register the node maintenance check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: nodeMaintCheckID,
CheckID: structs.NodeMaint,
Name: "Node Maintenance Mode",
Notes: reason,
Status: structs.HealthCritical,
Expand All @@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {

// DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
return
}
a.RemoveCheck(nodeMaintCheckID, true)
a.RemoveCheck(structs.NodeMaint, true)
a.logger.Printf("[INFO] agent: Node left maintenance mode")
}

Expand Down
6 changes: 3 additions & 3 deletions command/agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
}

// Ensure the maintenance check was registered
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered maintenance check")
}

// Check that the token was used
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got '%s'", token)
}

Expand Down Expand Up @@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
}

// Ensure the maintenance check was removed
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have removed maintenance check")
}
}
Expand Down
8 changes: 4 additions & 4 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("broken", "mytoken")

// Make sure the critical health check was added
check, ok := agent.state.Checks()[nodeMaintCheckID]
check, ok := agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}

// Check that the token was used to register the check
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got: '%s'", token)
}

Expand All @@ -1596,15 +1596,15 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.DisableNodeMaintenance()

// Ensure the check was deregistered
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have deregistered critical node check")
}

// Enter maintenance mode without passing a reason
agent.EnableNodeMaintenance("", "")

// Make sure the check was registered with the default note
check, ok = agent.state.Checks()[nodeMaintCheckID]
check, ok = agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}
Expand Down
12 changes: 11 additions & 1 deletion consul/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)

const (
// NodeMaint is the special key set by a node in maintenance mode.
NodeMaint = "_node_maintenance"

// ServiceMaintPrefix is the prefix for a service in maintenance mode.
ServiceMaintPrefix = "_service_maintenance:"
)

func ValidStatus(s string) bool {
Expand Down Expand Up @@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
return clone
}

// HealthChecks is a collection of HealthCheck structs.
type HealthChecks []*HealthCheck

// CheckServiceNode is used to provide the node, its service
Expand Down Expand Up @@ -460,7 +470,7 @@ type NodeInfo struct {
Address string
TaggedAddresses map[string]string
Services []*NodeService
Checks []*HealthCheck
Checks HealthChecks
}

// NodeDump is used to dump all the nodes with all their
Expand Down