-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat[healthcheck]: new health endpoint (#807)
- Loading branch information
Showing
8 changed files
with
503 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
from abc import ABC, abstractmethod | ||
from dataclasses import dataclass | ||
from datetime import datetime | ||
from enum import Enum | ||
from typing import Any, Optional | ||
|
||
|
||
class ComponentType(str, Enum):
    """Closed set of values accepted by ``ComponentHealthCheck.component_type``.

    Because ``str`` is mixed in, every member is also a string equal to its value.
    """

    DATASTORE = 'datastore'
    INTERNAL = 'internal'
    FULLNODE = 'fullnode'
|
||
|
||
class HealthCheckStatus(str, Enum):
    """Closed set of values accepted by ``ComponentHealthCheck.status``.

    Because ``str`` is mixed in, every member is also a string equal to its value.
    """

    PASS = 'pass'
    WARN = 'warn'
    FAIL = 'fail'
|
||
|
||
@dataclass
class ComponentHealthCheck:
    """Result of a health check performed on one specific component.

    ``time`` may be supplied by the caller; when it is left as ``None`` it is filled in with the
    current UTC instant at construction time.
    """

    component_name: str
    component_type: ComponentType
    status: HealthCheckStatus
    output: str
    time: Optional[str] = None
    component_id: Optional[str] = None
    observed_value: Optional[str] = None
    observed_unit: Optional[str] = None

    def __post_init__(self) -> None:
        """Default ``time`` to the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
        # Only fill in the timestamp when the caller did not provide one; overwriting it
        # unconditionally would make the ``time`` constructor argument meaningless.
        if self.time is None:
            # Imported here so the module-level imports do not need to change.
            from datetime import timezone

            # ``datetime.utcnow()`` is deprecated since Python 3.12; an aware "now" in UTC
            # produces the same formatted string.
            self.time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    def to_json(self) -> dict[str, str]:
        """Return a dict representation of the object. All field names are converted to camel case.

        ``componentType``, ``status`` and ``output`` are always present. ``time``, ``componentId``
        and the ``observedValue``/``observedUnit`` pair are only added when they are set (truthy).
        Note that ``component_name`` is not part of the returned dict.

        Raises:
            AssertionError: if ``observed_value`` is set but ``observed_unit`` is ``None``.
        """
        data = {
            'componentType': self.component_type.value,
            'status': self.status.value,
            'output': self.output,
        }

        if self.time:
            data['time'] = self.time

        if self.component_id:
            data['componentId'] = self.component_id

        if self.observed_value:
            # A value without a unit cannot be interpreted, so both must be provided together.
            assert (
                self.observed_unit is not None
            ), 'observed_unit must be set if observed_value is set'

            data['observedValue'] = self.observed_value
            data['observedUnit'] = self.observed_unit

        return data
|
||
|
||
@dataclass
class ServiceHealthCheck:
    """Aggregated health check result of a service, built from its component checks."""

    description: str
    checks: dict[str, list[ComponentHealthCheck]]

    def __post_init__(self) -> None:
        """Validate the freshly built object.

        Raises:
            ValueError: if ``checks`` is empty.
        """
        if not self.checks:
            raise ValueError('checks dict cannot be empty')

    @property
    def status(self) -> HealthCheckStatus:
        """Overall status derived from the component checks.

        Any failing check makes the service ``FAIL``; otherwise any warning makes it ``WARN``;
        otherwise it is ``PASS``.
        """
        statuses = [check.status for group in self.checks.values() for check in group]

        if any(current == HealthCheckStatus.FAIL for current in statuses):
            return HealthCheckStatus.FAIL

        if any(current == HealthCheckStatus.WARN for current in statuses):
            return HealthCheckStatus.WARN

        return HealthCheckStatus.PASS

    def get_http_status_code(self) -> int:
        """Map the overall status to an HTTP status code: 200 for PASS, 503 for WARN and FAIL.

        Raises:
            ValueError: if the status is none of the handled values.
        """
        current = self.status

        if current == HealthCheckStatus.PASS:
            return 200

        if current == HealthCheckStatus.WARN or current == HealthCheckStatus.FAIL:
            return 503

        raise ValueError(f'Missing treatment for status {current}')

    def to_json(self) -> dict[str, Any]:
        """Return a dict representation of the object. All field names are converted to camel case."""
        serialized_checks: dict[str, Any] = {}
        for name, component_checks in self.checks.items():
            serialized_checks[name] = [check.to_json() for check in component_checks]

        return {
            'status': self.status.value,
            'description': self.description,
            'checks': serialized_checks,
        }
|
||
|
||
class ComponentHealthCheckInterface(ABC):
    """Abstract base for classes that implement the health check of a component."""

    @abstractmethod
    async def get_health_check(self) -> ComponentHealthCheck:
        """Produce the health check result of the component.

        Concrete subclasses must override this coroutine.
        """
        raise NotImplementedError
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Copyright 2021 Hathor Labs | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hathor.healthcheck.resources.healthcheck import HealthcheckResource | ||
|
||
# Names exported by this package.
__all__ = ['HealthcheckResource']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
import hathor | ||
from hathor.api_util import Resource, get_arg_default, get_args | ||
from hathor.cli.openapi_files.register import register_resource | ||
from hathor.healthcheck.models import ComponentHealthCheck, ComponentType, HealthCheckStatus, ServiceHealthCheck | ||
from hathor.manager import HathorManager | ||
from hathor.util import json_dumpb | ||
|
||
|
||
def build_sync_health_status(manager: HathorManager) -> ComponentHealthCheck:
    """Build the health check entry of the 'sync' component.

    The status is PASS when ``manager.is_sync_healthy()`` reports healthy and FAIL otherwise.
    The output is the reason returned by the manager, or 'Healthy' when that reason is empty.
    """
    is_healthy, unhealthy_reason = manager.is_sync_healthy()

    if is_healthy:
        sync_status = HealthCheckStatus.PASS
    else:
        sync_status = HealthCheckStatus.FAIL

    return ComponentHealthCheck(
        component_name='sync',
        component_type=ComponentType.INTERNAL,
        status=sync_status,
        output=unhealthy_reason or 'Healthy',
    )
|
||
|
||
@register_resource
class HealthcheckResource(Resource):
    """Leaf resource that reports the health status of the fullnode."""

    isLeaf = True

    def __init__(self, manager: HathorManager):
        self.manager = manager

    def render_GET(self, request):
        """ GET request /health/

        Returns the health status of the fullnode.

        When the 'strict_status_code' query argument is '1' the response code is always 200, even if
        the fullnode is unhealthy. Otherwise the response code is derived from the health status.
        Forcing 200 can be useful when integrating with tools that only pass the response through
        when its code is 200.

        :rtype: string (json)
        """
        query_args = get_args(request)
        always_ok = get_arg_default(query_args, 'strict_status_code', '0') == '1'

        # Every component taken into account by the healthcheck goes into this list.
        component_checks = [
            build_sync_health_status(self.manager),
        ]

        checks_by_name = {}
        for component_check in component_checks:
            checks_by_name[component_check.component_name] = [component_check]

        service_health = ServiceHealthCheck(
            description=f'Hathor-core {hathor.__version__}',
            checks=checks_by_name,
        )

        response_code = 200 if always_ok else service_health.get_http_status_code()
        request.setResponseCode(response_code)

        return json_dumpb(service_health.to_json())
|
||
|
||
# OpenAPI description of the /health endpoint.
# The response examples mirror what ComponentHealthCheck.to_json() emits: each check carries
# 'componentType', 'status', 'output' and 'time'. The component name is only present as the key
# of the 'checks' mapping, so the examples do not repeat it inside each entry.
HealthcheckResource.openapi = {
    '/health': {
        'x-visibility': 'public',
        'x-rate-limit': {
            'global': [
                {
                    'rate': '10r/s',
                    'burst': 10,
                    'delay': 5
                }
            ],
            'per-ip': [
                {
                    'rate': '1r/s',
                    'burst': 3,
                    'delay': 2
                }
            ]
        },
        'get': {
            'tags': ['healthcheck'],
            'operationId': 'get',
            'summary': 'Health status of the fullnode',
            'description': '''
Returns 200 if the fullnode should be considered healthy.
Returns 503 otherwise. The response will contain the components that were considered for the healthcheck
and the reason why they were unhealthy.
Returning 503 with a response body is not the standard behavior for our API, but it was chosen because
most healthcheck tools expect a 503 response code to indicate that the service is unhealthy.
Optionally, there is a query parameter 'strict_status_code' that can be used to return 200 even if the fullnode
is unhealthy. When its value is 1, the response will always be 200.
We currently perform 2 checks in the sync mechanism for the healthcheck:
1. Whether the fullnode has recent block activity, i.e. if the fullnode has blocks with recent timestamps.
2. Whether the fullnode has at least one synced peer
''',
            'parameters': [
                {
                    'name': 'strict_status_code',
                    'in': 'query',
                    'description': 'Enables strict status code. If set to 1, the response will always be 200.',
                    'required': False,
                    'schema': {
                        'type': 'string'
                    }
                },
            ],
            'responses': {
                '200': {
                    'description': 'Healthy',
                    'content': {
                        'application/json': {
                            'examples': {
                                'healthy': {
                                    'summary': 'Healthy node',
                                    'value': {
                                        'status': 'pass',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'pass',
                                                    'output': 'Healthy',
                                                    'time': '2023-10-10T12:00:00Z'
                                                }
                                            ]
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
                '503': {
                    'description': 'Unhealthy',
                    'content': {
                        'application/json': {
                            'examples': {
                                'no_recent_activity': {
                                    'summary': 'Node with no recent activity',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node doesn\'t have recent blocks',
                                                    'time': '2023-10-10T12:00:00Z'
                                                }
                                            ]
                                        }
                                    }
                                },
                                'no_synced_peer': {
                                    'summary': 'Node with no synced peer',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node doesn\'t have a synced peer',
                                                    'time': '2023-10-10T12:00:00Z'
                                                }
                                            ]
                                        }
                                    }
                                },
                                'peer_best_block_far_ahead': {
                                    'summary': 'Peer with best block too far ahead',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node\'s peer with highest height is too far ahead.',
                                                    'time': '2023-10-10T12:00:00Z'
                                                }
                                            ]
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.