From f515f6dea249e07f483d098ef8a51fa7fb3915c9 Mon Sep 17 00:00:00 2001 From: Luis Helder Date: Thu, 19 Oct 2023 17:16:43 -0300 Subject: [PATCH] feat[healthcheck]: new health endpoint (#807) --- hathor/builder/resources_builder.py | 2 + hathor/healthcheck/models.py | 116 ++++++++++ hathor/healthcheck/resources/__init__.py | 19 ++ hathor/healthcheck/resources/healthcheck.py | 199 ++++++++++++++++++ hathor/manager.py | 3 +- hathor/p2p/resources/healthcheck.py | 2 +- .../resources/healthcheck/test_healthcheck.py | 164 +++++++++++++++ tests/resources/p2p/test_healthcheck.py | 95 --------- 8 files changed, 503 insertions(+), 97 deletions(-) create mode 100644 hathor/healthcheck/models.py create mode 100644 hathor/healthcheck/resources/__init__.py create mode 100644 hathor/healthcheck/resources/healthcheck.py create mode 100644 tests/resources/healthcheck/test_healthcheck.py delete mode 100644 tests/resources/p2p/test_healthcheck.py diff --git a/hathor/builder/resources_builder.py b/hathor/builder/resources_builder.py index 5fb42ed0a..46d46023d 100644 --- a/hathor/builder/resources_builder.py +++ b/hathor/builder/resources_builder.py @@ -87,6 +87,7 @@ def create_resources(self) -> server.Site: DebugRejectResource, ) from hathor.feature_activation.resources.feature import FeatureResource + from hathor.healthcheck.resources import HealthcheckResource from hathor.mining.ws import MiningWebsocketFactory from hathor.p2p.resources import ( AddPeersResource, @@ -179,6 +180,7 @@ def create_resources(self) -> server.Site: (b'profiler', ProfilerResource(self.manager), root), (b'top', CPUProfilerResource(self.manager, cpu), root), (b'mempool', MempoolResource(self.manager), root), + (b'health', HealthcheckResource(self.manager), root), # mining (b'mining', MiningResource(self.manager), root), (b'getmininginfo', MiningInfoResource(self.manager), root), diff --git a/hathor/healthcheck/models.py b/hathor/healthcheck/models.py new file mode 100644 index 000000000..c75457720 --- /dev/null +++ b/hathor/healthcheck/models.py @@ -0,0 +1,116 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from typing import Any, Optional + + +class ComponentType(str, Enum): + """Enum used to store the component types that can be used in the HealthCheckComponentStatus class.""" + + DATASTORE = 'datastore' + INTERNAL = 'internal' + FULLNODE = 'fullnode' + + +class HealthCheckStatus(str, Enum): + """Enum used to store the component status that can be used in the HealthCheckComponentStatus class.""" + + PASS = 'pass' + WARN = 'warn' + FAIL = 'fail' + + +@dataclass +class ComponentHealthCheck: + """This class is used to store the result of a health check in a specific component.""" + + component_name: str + component_type: ComponentType + status: HealthCheckStatus + output: str + time: Optional[str] = None + component_id: Optional[str] = None + observed_value: Optional[str] = None + observed_unit: Optional[str] = None + + def __post_init__(self) -> None: + self.time = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') + + def to_json(self) -> dict[str, str]: + """Return a dict representation of the object. All field names are converted to camel case.""" + json = { + 'componentType': self.component_type.value, + 'status': self.status.value, + 'output': self.output, + } + + if self.time: + json['time'] = self.time + + if self.component_id: + json['componentId'] = self.component_id + + if self.observed_value: + assert ( + self.observed_unit is not None + ), 'observed_unit must be set if observed_value is set' + + json['observedValue'] = self.observed_value + json['observedUnit'] = self.observed_unit + + return json + + +@dataclass +class ServiceHealthCheck: + """This class is used to store the result of a service health check.""" + + description: str + checks: dict[str, list[ComponentHealthCheck]] + + @property + def status(self) -> HealthCheckStatus: + """Return the status of the health check based on the status of the components.""" + status = HealthCheckStatus.PASS + + for component_checks in self.checks.values(): + for check in component_checks: + if check.status == HealthCheckStatus.FAIL: + return HealthCheckStatus.FAIL + elif check.status == HealthCheckStatus.WARN: + status = HealthCheckStatus.WARN + + return status + + def __post_init__(self) -> None: + """Perform some validations after the object is initialized.""" + # Make sure the checks dict is not empty + if not self.checks: + raise ValueError('checks dict cannot be empty') + + def get_http_status_code(self) -> int: + """Return the HTTP status code for the status.""" + if self.status in [HealthCheckStatus.PASS]: + return 200 + elif self.status in [HealthCheckStatus.WARN, HealthCheckStatus.FAIL]: + return 503 + else: + raise ValueError(f'Missing treatment for status {self.status}') + + def to_json(self) -> dict[str, Any]: + """Return a dict representation of the object. All field names are converted to camel case.""" + return { + 'status': self.status.value, + 'description': self.description, + 'checks': {k: [c.to_json() for c in v] for k, v in self.checks.items()}, + } + + +class ComponentHealthCheckInterface(ABC): + """This is an interface to be used by other classes implementing health checks for components.""" + + @abstractmethod + async def get_health_check(self) -> ComponentHealthCheck: + """Return the health check status for the component.""" + raise NotImplementedError() diff --git a/hathor/healthcheck/resources/__init__.py b/hathor/healthcheck/resources/__init__.py new file mode 100644 index 000000000..514b99ff0 --- /dev/null +++ b/hathor/healthcheck/resources/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2021 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from hathor.healthcheck.resources.healthcheck import HealthcheckResource + +__all__ = [ + 'HealthcheckResource', +] diff --git a/hathor/healthcheck/resources/healthcheck.py b/hathor/healthcheck/resources/healthcheck.py new file mode 100644 index 000000000..2cdc29cd9 --- /dev/null +++ b/hathor/healthcheck/resources/healthcheck.py @@ -0,0 +1,199 @@ +import hathor +from hathor.api_util import Resource, get_arg_default, get_args +from hathor.cli.openapi_files.register import register_resource +from hathor.healthcheck.models import ComponentHealthCheck, ComponentType, HealthCheckStatus, ServiceHealthCheck +from hathor.manager import HathorManager +from hathor.util import json_dumpb + + +def build_sync_health_status(manager: HathorManager) -> ComponentHealthCheck: + """Builds the sync health status object.""" + healthy, reason = manager.is_sync_healthy() + + return ComponentHealthCheck( + component_name='sync', + component_type=ComponentType.INTERNAL, + status=HealthCheckStatus.PASS if healthy else HealthCheckStatus.FAIL, + output=reason or 'Healthy', + ) + + +@register_resource +class HealthcheckResource(Resource): + isLeaf = True + + def __init__(self, manager: HathorManager): + self.manager = manager + + def render_GET(self, request): + """ GET request /health/ + Returns the health status of the fullnode + + The 'strict_status_code' argument can be used to return 200 even if the fullnode is unhealthy. + This can be useful when integrating with tools that could prefer to pass the response code only + in case the response is 200. + + :rtype: string (json) + """ + raw_args = get_args(request) + strict_status_code = get_arg_default(raw_args, 'strict_status_code', '0') == '1' + + components_health_checks = [ + build_sync_health_status(self.manager) + ] + + health_check = ServiceHealthCheck( + description=f'Hathor-core {hathor.__version__}', + checks={c.component_name: [c] for c in components_health_checks}, + ) + + if strict_status_code: + request.setResponseCode(200) + else: + status_code = health_check.get_http_status_code() + request.setResponseCode(status_code) + + return json_dumpb(health_check.to_json()) + + +HealthcheckResource.openapi = { + '/health': { + 'x-visibility': 'public', + 'x-rate-limit': { + 'global': [ + { + 'rate': '10r/s', + 'burst': 10, + 'delay': 5 + } + ], + 'per-ip': [ + { + 'rate': '1r/s', + 'burst': 3, + 'delay': 2 + } + ] + }, + 'get': { + 'tags': ['healthcheck'], + 'operationId': 'get', + 'summary': 'Health status of the fullnode', + 'description': ''' +Returns 200 if the fullnode should be considered healthy. + +Returns 503 otherwise. The response will contain the components that were considered for the healthcheck +and the reason why they were unhealthy. + +Returning 503 with a response body is not the standard behavior for our API, but it was chosen because +most healthcheck tools expect a 503 response code to indicate that the service is unhealthy. + +Optionally, there is a query parameter 'strict_status_code' that can be used to return 200 even if the fullnode +is unhealthy. When its value is 1, the response will always be 200. + +We currently perform 2 checks in the sync mechanism for the healthcheck: +1. Whether the fullnode has recent block activity, i.e. if the fullnode has blocks with recent timestamps. +2. Whether the fullnode has at least one synced peer + ''', + 'parameters': [ + { + 'name': 'strict_status_code', + 'in': 'query', + 'description': 'Enables strict status code. If set to 1, the response will always be 200.', + 'required': False, + 'schema': { + 'type': 'string' + } + }, + ], + 'responses': { + '200': { + 'description': 'Healthy', + 'content': { + 'application/json': { + 'examples': { + 'healthy': { + 'summary': 'Healthy node', + 'value': { + 'status': 'pass', + 'description': 'Hathor-core v0.56.0', + 'checks': { + 'sync': [ + { + 'componentName': 'sync', + 'componentType': 'internal', + 'status': 'pass', + 'output': 'Healthy' + } + ] + } + } + } + } + } + } + }, + '503': { + 'description': 'Unhealthy', + 'content': { + 'application/json': { + 'examples': { + 'no_recent_activity': { + 'summary': 'Node with no recent activity', + 'value': { + 'status': 'fail', + 'description': 'Hathor-core v0.56.0', + 'checks': { + 'sync': [ + { + 'componentName': 'sync', + 'componentType': 'internal', + 'status': 'fail', + 'output': 'Node doesn\'t have recent blocks' + } + ] + } + } + }, + 'no_synced_peer': { + 'summary': 'Node with no synced peer', + 'value': { + 'status': 'fail', + 'description': 'Hathor-core v0.56.0', + 'checks': { + 'sync': [ + { + 'componentName': 'sync', + 'componentType': 'internal', + 'status': 'fail', + 'output': 'Node doesn\'t have a synced peer' + } + ] + } + } + }, + 'peer_best_block_far_ahead': { + 'summary': 'Peer with best block too far ahead', + 'value': { + 'status': 'fail', + 'description': 'Hathor-core v0.56.0', + 'checks': { + 'sync': [ + { + 'componentName': 'sync', + 'componentType': 'internal', + 'status': 'fail', + 'output': 'Node\'s peer with highest height is too far ahead.' + } + ] + } + } + } + } + } + } + }, + } + } + } +} diff --git a/hathor/manager.py b/hathor/manager.py index 630c1b54c..749384af6 100644 --- a/hathor/manager.py +++ b/hathor/manager.py @@ -1165,7 +1165,8 @@ def has_recent_activity(self) -> bool: return True - def is_healthy(self) -> tuple[bool, Optional[str]]: + def is_sync_healthy(self) -> tuple[bool, Optional[str]]: + # This checks whether the last txs (blocks or transactions) we received are recent enough. if not self.has_recent_activity(): return False, HathorManager.UnhealthinessReason.NO_RECENT_ACTIVITY diff --git a/hathor/p2p/resources/healthcheck.py b/hathor/p2p/resources/healthcheck.py index 3e1c1e368..a87182b8c 100644 --- a/hathor/p2p/resources/healthcheck.py +++ b/hathor/p2p/resources/healthcheck.py @@ -17,7 +17,7 @@ def render_GET(self, request): :rtype: string (json) """ - healthy, reason = self.manager.is_healthy() + healthy, reason = self.manager.is_sync_healthy() if not healthy: request.setResponseCode(503) diff --git a/tests/resources/healthcheck/test_healthcheck.py b/tests/resources/healthcheck/test_healthcheck.py new file mode 100644 index 000000000..888aac2af --- /dev/null +++ b/tests/resources/healthcheck/test_healthcheck.py @@ -0,0 +1,164 @@ +from unittest.mock import ANY + +from twisted.internet.defer import inlineCallbacks + +from hathor.healthcheck.resources.healthcheck import HealthcheckResource +from hathor.manager import HathorManager +from hathor.simulator import FakeConnection +from tests import unittest +from tests.resources.base_resource import StubSite, _BaseResourceTest +from tests.utils import add_new_blocks + + +class BaseHealthcheckReadinessTest(_BaseResourceTest._ResourceTest): + __test__ = False + + def setUp(self): + super().setUp() + self.web = StubSite(HealthcheckResource(self.manager)) + + @inlineCallbacks + def test_get_no_recent_activity(self): + """Scenario where the node doesn't have a recent block + """ + response = yield self.web.get('/health') + data = response.json_value() + + self.assertEqual(response.responseCode, 503) + self.assertEqual(data, { + 'status': 'fail', + 'description': ANY, + 'checks': { + 'sync': [{ + 'componentType': 'internal', + 'status': 'fail', + 'output': HathorManager.UnhealthinessReason.NO_RECENT_ACTIVITY, + 'time': ANY + }] + } + }) + + @inlineCallbacks + def test_strict_status_code(self): + """Make sure the 'strict_status_code' parameter is working. + The node should return 200 even if it's not ready. + """ + response = yield self.web.get('/health', {b'strict_status_code': b'1'}) + data = response.json_value() + + self.assertEqual(response.responseCode, 200) + self.assertEqual(data, { + 'status': 'fail', + 'description': ANY, + 'checks': { + 'sync': [{ + 'componentType': 'internal', + 'status': 'fail', + 'output': HathorManager.UnhealthinessReason.NO_RECENT_ACTIVITY, + 'time': ANY + }] + } + }) + + @inlineCallbacks + def test_get_no_connected_peer(self): + """Scenario where the node doesn't have any connected peer + """ + # This will make sure the node has recent activity + add_new_blocks(self.manager, 5) + + self.assertEqual(self.manager.has_recent_activity(), True) + + response = yield self.web.get('/health') + data = response.json_value() + + self.assertEqual(response.responseCode, 503) + self.assertEqual(data, { + 'status': 'fail', + 'description': ANY, + 'checks': { + 'sync': [{ + 'componentType': 'internal', + 'status': 'fail', + 'output': HathorManager.UnhealthinessReason.NO_SYNCED_PEER, + 'time': ANY + }] + } + }) + + @inlineCallbacks + def test_get_peer_out_of_sync(self): + """Scenario where the node is connected with a peer but not synced + """ + # This will make sure the node has recent activity + add_new_blocks(self.manager, 5) + + self.manager2 = self.create_peer('testnet') + self.conn1 = FakeConnection(self.manager, self.manager2) + self.conn1.run_one_step() # HELLO + self.conn1.run_one_step() # PEER-ID + self.conn1.run_one_step() # READY + + self.assertEqual(self.manager2.state, self.manager2.NodeState.READY) + + response = yield self.web.get('/health') + data = response.json_value() + + self.assertEqual(response.responseCode, 503) + self.assertEqual(data, { + 'status': 'fail', + 'description': ANY, + 'checks': { + 'sync': [{ + 'componentType': 'internal', + 'status': 'fail', + 'output': HathorManager.UnhealthinessReason.NO_SYNCED_PEER, + 'time': ANY + }] + } + }) + + @inlineCallbacks + def test_get_ready(self): + """Scenario where the node is ready + """ + self.manager2 = self.create_peer('testnet') + self.conn1 = FakeConnection(self.manager, self.manager2) + + # This will make sure the node has recent activity + add_new_blocks(self.manager, 5) + + # This will make sure the peers are synced + for _ in range(600): + self.conn1.run_one_step(debug=True) + self.clock.advance(0.1) + + response = yield self.web.get('/health') + data = response.json_value() + + self.assertEqual(response.responseCode, 200) + self.assertEqual(data, { + 'status': 'pass', + 'description': ANY, + 'checks': { + 'sync': [{ + 'componentType': 'internal', + 'status': 'pass', + 'output': 'Healthy', + 'time': ANY + }] + } + }) + + +class SyncV1StatusTest(unittest.SyncV1Params, BaseHealthcheckReadinessTest): + __test__ = True + + +class SyncV2StatusTest(unittest.SyncV2Params, BaseHealthcheckReadinessTest): + __test__ = True + + +# sync-bridge should behave like sync-v2 +class SyncBridgeStatusTest(unittest.SyncBridgeParams, SyncV2StatusTest): + pass diff --git a/tests/resources/p2p/test_healthcheck.py b/tests/resources/p2p/test_healthcheck.py deleted file mode 100644 index 90bf1e260..000000000 --- a/tests/resources/p2p/test_healthcheck.py +++ /dev/null @@ -1,95 +0,0 @@ -from twisted.internet.defer import inlineCallbacks - -from hathor.manager import HathorManager -from hathor.p2p.resources.healthcheck import HealthcheckReadinessResource -from hathor.simulator import FakeConnection -from tests import unittest -from tests.resources.base_resource import StubSite, _BaseResourceTest -from tests.utils import add_new_blocks - - -class BaseHealthcheckReadinessTest(_BaseResourceTest._ResourceTest): - __test__ = False - - def setUp(self): - super().setUp() - self.web = StubSite(HealthcheckReadinessResource(self.manager)) - - @inlineCallbacks - def test_get_no_recent_activity(self): - """Scenario where the node doesn't have a recent block - """ - response = yield self.web.get("p2p/readiness") - data = response.json_value() - - self.assertEqual(data['success'], False) - self.assertEqual(data['reason'], HathorManager.UnhealthinessReason.NO_RECENT_ACTIVITY) - - @inlineCallbacks - def test_get_no_connected_peer(self): - """Scenario where the node doesn't have any connected peer - """ - # This will make sure the node has recent activity - add_new_blocks(self.manager, 5) - - self.assertEqual(self.manager.has_recent_activity(), True) - - response = yield self.web.get("p2p/readiness") - data = response.json_value() - - self.assertEqual(data['success'], False) - self.assertEqual(data['reason'], HathorManager.UnhealthinessReason.NO_SYNCED_PEER) - - @inlineCallbacks - def test_get_peer_out_of_sync(self): - """Scenario where the node is connected with a peer but not synced - """ - # This will make sure the node has recent activity - add_new_blocks(self.manager, 5) - - self.manager2 = self.create_peer('testnet') - self.conn1 = FakeConnection(self.manager, self.manager2) - self.conn1.run_one_step() # HELLO - self.conn1.run_one_step() # PEER-ID - self.conn1.run_one_step() # READY - - self.assertEqual(self.manager2.state, self.manager2.NodeState.READY) - - response = yield self.web.get("p2p/readiness") - data = response.json_value() - - self.assertEqual(data['success'], False) - self.assertEqual(data['reason'], HathorManager.UnhealthinessReason.NO_SYNCED_PEER) - - @inlineCallbacks - def test_get_ready(self): - """Scenario where the node is ready - """ - self.manager2 = self.create_peer('testnet') - self.conn1 = FakeConnection(self.manager, self.manager2) - - # This will make sure the node has recent activity - add_new_blocks(self.manager, 5) - - # This will make sure the peers are synced - for _ in range(600): - self.conn1.run_one_step(debug=True) - self.clock.advance(0.1) - - response = yield self.web.get("p2p/readiness") - data = response.json_value() - - self.assertEqual(data['success'], True) - - -class SyncV1StatusTest(unittest.SyncV1Params, BaseHealthcheckReadinessTest): - __test__ = True - - -class SyncV2StatusTest(unittest.SyncV2Params, BaseHealthcheckReadinessTest): - __test__ = True - - -# sync-bridge should behave like sync-v2 -class SyncBridgeStatusTest(unittest.SyncBridgeParams, SyncV2StatusTest): - pass