Skip to content

Commit

Permalink
feat[healthcheck]: new health endpoint (#807)
Browse files Browse the repository at this point in the history
  • Loading branch information
luislhl authored Oct 19, 2023
1 parent 4f2da10 commit f515f6d
Show file tree
Hide file tree
Showing 8 changed files with 503 additions and 97 deletions.
2 changes: 2 additions & 0 deletions hathor/builder/resources_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def create_resources(self) -> server.Site:
DebugRejectResource,
)
from hathor.feature_activation.resources.feature import FeatureResource
from hathor.healthcheck.resources import HealthcheckResource
from hathor.mining.ws import MiningWebsocketFactory
from hathor.p2p.resources import (
AddPeersResource,
Expand Down Expand Up @@ -179,6 +180,7 @@ def create_resources(self) -> server.Site:
(b'profiler', ProfilerResource(self.manager), root),
(b'top', CPUProfilerResource(self.manager, cpu), root),
(b'mempool', MempoolResource(self.manager), root),
(b'health', HealthcheckResource(self.manager), root),
# mining
(b'mining', MiningResource(self.manager), root),
(b'getmininginfo', MiningInfoResource(self.manager), root),
Expand Down
116 changes: 116 additions & 0 deletions hathor/healthcheck/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Optional


class ComponentType(str, Enum):
    """Kinds of component that a ``ComponentHealthCheck`` can describe.

    Members are ``str`` subclasses, so each one compares equal to its raw value.
    """

    DATASTORE = 'datastore'
    INTERNAL = 'internal'
    FULLNODE = 'fullnode'


class HealthCheckStatus(str, Enum):
    """Possible outcomes of a health check, as stored in ``ComponentHealthCheck.status``.

    Members are ``str`` subclasses, so each one compares equal to its raw value.
    """

    PASS = 'pass'
    WARN = 'warn'
    FAIL = 'fail'


@dataclass
class ComponentHealthCheck:
    """Result of a single health check performed on one component.

    ``to_json`` serializes the result using camel-case keys. ``component_name`` is
    kept on the object but is not part of the serialized output.
    """

    # Name of the checked component; not emitted by ``to_json``.
    component_name: str
    # Kind of component that was checked; serialized as 'componentType'.
    component_type: ComponentType
    # Outcome of the check; serialized as 'status'.
    status: HealthCheckStatus
    # Free-text details about the outcome; serialized as 'output'.
    output: str
    # Timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``; filled in when the caller omits it.
    time: Optional[str] = None
    # Optional identifier for the component; serialized as 'componentId'.
    component_id: Optional[str] = None
    # Optional observed value; ``observed_unit`` must accompany it.
    observed_value: Optional[str] = None
    # Unit of ``observed_value``; serialized as 'observedUnit'.
    observed_unit: Optional[str] = None

    def __post_init__(self) -> None:
        """Default ``time`` to the current UTC time when the caller did not supply one."""
        # Only fill in the timestamp when it is missing, so that an explicitly
        # provided ``time`` is not silently discarded.
        if self.time is None:
            # ``datetime.utcnow()`` is deprecated; an aware UTC datetime renders
            # identically with this format string.
            self.time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    def to_json(self) -> dict[str, str]:
        """Return a dict representation of the object. All field names are converted to camel case.

        'componentType', 'status' and 'output' are always present. 'time' and
        'componentId' are included only when truthy. 'observedValue' and
        'observedUnit' are included together, only when ``observed_value`` is truthy.

        :raises AssertionError: if ``observed_value`` is set but ``observed_unit`` is None.
        """
        payload = {
            'componentType': self.component_type.value,
            'status': self.status.value,
            'output': self.output,
        }

        if self.time:
            payload['time'] = self.time

        if self.component_id:
            payload['componentId'] = self.component_id

        if self.observed_value:
            # A value without a unit cannot be interpreted by the consumer.
            assert (
                self.observed_unit is not None
            ), 'observed_unit must be set if observed_value is set'

            payload['observedValue'] = self.observed_value
            payload['observedUnit'] = self.observed_unit

        return payload


@dataclass
class ServiceHealthCheck:
    """Aggregated result of the health checks of every component of a service."""

    # Free-text description of the service; serialized as 'description'.
    description: str
    # Component check results, grouped under an arbitrary string key.
    checks: dict[str, list[ComponentHealthCheck]]

    def __post_init__(self) -> None:
        """Reject construction when no checks were provided.

        :raises ValueError: if ``checks`` is empty.
        """
        if not self.checks:
            raise ValueError('checks dict cannot be empty')

    @property
    def status(self) -> HealthCheckStatus:
        """Overall status: FAIL if any check failed, else WARN if any warned, else PASS."""
        overall = HealthCheckStatus.PASS

        # Walk every check of every group lazily; the first FAIL settles the result.
        every_check = (check for group in self.checks.values() for check in group)
        for check in every_check:
            if check.status == HealthCheckStatus.FAIL:
                return HealthCheckStatus.FAIL
            if check.status == HealthCheckStatus.WARN:
                overall = HealthCheckStatus.WARN

        return overall

    def get_http_status_code(self) -> int:
        """Map the overall status to an HTTP status code: 200 for PASS, 503 for WARN or FAIL.

        :raises ValueError: if the status is none of the handled values.
        """
        current = self.status

        if current == HealthCheckStatus.PASS:
            return 200
        if current in (HealthCheckStatus.WARN, HealthCheckStatus.FAIL):
            return 503

        raise ValueError(f'Missing treatment for status {current}')

    def to_json(self) -> dict[str, Any]:
        """Return a dict representation of the object, using camel-case field names."""
        payload: dict[str, Any] = {
            'status': self.status.value,
            'description': self.description,
        }

        serialized_checks = {}
        for key, group in self.checks.items():
            serialized_checks[key] = [check.to_json() for check in group]
        payload['checks'] = serialized_checks

        return payload


class ComponentHealthCheckInterface(ABC):
    """Contract for classes that report the health of a single component."""

    @abstractmethod
    async def get_health_check(self) -> ComponentHealthCheck:
        """Produce the ``ComponentHealthCheck`` describing this component's health."""
        raise NotImplementedError
19 changes: 19 additions & 0 deletions hathor/healthcheck/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright 2021 Hathor Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hathor.healthcheck.resources.healthcheck import HealthcheckResource

__all__ = [
'HealthcheckResource',
]
199 changes: 199 additions & 0 deletions hathor/healthcheck/resources/healthcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import hathor
from hathor.api_util import Resource, get_arg_default, get_args
from hathor.cli.openapi_files.register import register_resource
from hathor.healthcheck.models import ComponentHealthCheck, ComponentType, HealthCheckStatus, ServiceHealthCheck
from hathor.manager import HathorManager
from hathor.util import json_dumpb


def build_sync_health_status(manager: HathorManager) -> ComponentHealthCheck:
    """Build the health check result for the 'sync' component.

    The status is PASS when ``manager.is_sync_healthy()`` reports healthy and FAIL
    otherwise. The reported reason becomes the output, falling back to 'Healthy'
    when no reason is given.
    """
    is_healthy, unhealthy_reason = manager.is_sync_healthy()

    if is_healthy:
        sync_status = HealthCheckStatus.PASS
    else:
        sync_status = HealthCheckStatus.FAIL

    return ComponentHealthCheck(
        component_name='sync',
        component_type=ComponentType.INTERNAL,
        status=sync_status,
        output=unhealthy_reason or 'Healthy',
    )


# Leaf resource that reports the fullnode health as JSON on GET requests.
@register_resource
class HealthcheckResource(Resource):
    isLeaf = True

    def __init__(self, manager: HathorManager):
        self.manager = manager

    def render_GET(self, request):
        """ GET request /health/
        Returns the health status of the fullnode.

        By default the response code follows the health status (see
        ServiceHealthCheck.get_http_status_code). When the 'strict_status_code'
        query argument is '1', the response code is always 200, whatever the
        health status is. This helps when integrating with tools that only
        forward the response body if the response code is 200.

        :rtype: string (json)
        """
        query_args = get_args(request)
        force_ok = get_arg_default(query_args, 'strict_status_code', '0') == '1'

        # Only the sync mechanism is checked for now.
        component_checks = [build_sync_health_status(self.manager)]

        service_check = ServiceHealthCheck(
            description=f'Hathor-core {hathor.__version__}',
            checks={check.component_name: [check] for check in component_checks},
        )

        # The health-derived code is only computed when it is actually needed.
        request.setResponseCode(200 if force_ok else service_check.get_http_status_code())

        return json_dumpb(service_check.to_json())


# OpenAPI description of the GET /health endpoint, attached to the resource class.
# It declares the endpoint visibility, its rate limits, the optional
# 'strict_status_code' query parameter and example payloads for the 200 and 503
# responses.
#
# NOTE(review): the example payloads below include a 'componentName' key and no
# 'time' key, whereas ComponentHealthCheck.to_json() (hathor/healthcheck/models.py)
# does not emit 'componentName' and does emit 'time' -- confirm which side should
# be adjusted.
HealthcheckResource.openapi = {
    '/health': {
        'x-visibility': 'public',
        # Rate limits, declared both globally and per client IP.
        'x-rate-limit': {
            'global': [
                {
                    'rate': '10r/s',
                    'burst': 10,
                    'delay': 5
                }
            ],
            'per-ip': [
                {
                    'rate': '1r/s',
                    'burst': 3,
                    'delay': 2
                }
            ]
        },
        'get': {
            'tags': ['healthcheck'],
            'operationId': 'get',
            'summary': 'Health status of the fullnode',
            'description': '''
Returns 200 if the fullnode should be considered healthy.
Returns 503 otherwise. The response will contain the components that were considered for the healthcheck
and the reason why they were unhealthy.
Returning 503 with a response body is not the standard behavior for our API, but it was chosen because
most healthcheck tools expect a 503 response code to indicate that the service is unhealthy.
Optionally, there is a query parameter 'strict_status_code' that can be used to return 200 even if the fullnode
is unhealthy. When its value is 1, the response will always be 200.
We currently perform 2 checks in the sync mechanism for the healthcheck:
1. Whether the fullnode has recent block activity, i.e. if the fullnode has blocks with recent timestamps.
2. Whether the fullnode has at least one synced peer
''',
            'parameters': [
                {
                    'name': 'strict_status_code',
                    'in': 'query',
                    'description': 'Enables strict status code. If set to 1, the response will always be 200.',
                    'required': False,
                    'schema': {
                        'type': 'string'
                    }
                },
            ],
            'responses': {
                # Example body returned when every check passes.
                '200': {
                    'description': 'Healthy',
                    'content': {
                        'application/json': {
                            'examples': {
                                'healthy': {
                                    'summary': 'Healthy node',
                                    'value': {
                                        'status': 'pass',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentName': 'sync',
                                                    'componentType': 'internal',
                                                    'status': 'pass',
                                                    'output': 'Healthy'
                                                }
                                            ]
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
                # Example bodies for the unhealthy case, one per documented failure reason.
                '503': {
                    'description': 'Unhealthy',
                    'content': {
                        'application/json': {
                            'examples': {
                                'no_recent_activity': {
                                    'summary': 'Node with no recent activity',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentName': 'sync',
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node doesn\'t have recent blocks'
                                                }
                                            ]
                                        }
                                    }
                                },
                                'no_synced_peer': {
                                    'summary': 'Node with no synced peer',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentName': 'sync',
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node doesn\'t have a synced peer'
                                                }
                                            ]
                                        }
                                    }
                                },
                                'peer_best_block_far_ahead': {
                                    'summary': 'Peer with best block too far ahead',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentName': 'sync',
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node\'s peer with highest height is too far ahead.'
                                                }
                                            ]
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }
}
3 changes: 2 additions & 1 deletion hathor/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1165,7 +1165,8 @@ def has_recent_activity(self) -> bool:

return True

def is_healthy(self) -> tuple[bool, Optional[str]]:
def is_sync_healthy(self) -> tuple[bool, Optional[str]]:
# This checks whether the last txs (blocks or transactions) we received are recent enough.
if not self.has_recent_activity():
return False, HathorManager.UnhealthinessReason.NO_RECENT_ACTIVITY

Expand Down
2 changes: 1 addition & 1 deletion hathor/p2p/resources/healthcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def render_GET(self, request):
:rtype: string (json)
"""
healthy, reason = self.manager.is_healthy()
healthy, reason = self.manager.is_sync_healthy()

if not healthy:
request.setResponseCode(503)
Expand Down
Loading

0 comments on commit f515f6d

Please sign in to comment.