-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[system-health] Add support for monitoring system health (#4835)
* system health first commit * system health daemon first commit * Finish healthd * Changes due to lower layer logic change * Get ASIC temperature from TEMPERATURE_INFO table * Add system health make rule and service files * fix bugs found during manual test * Change make file to install system-health library to host * Set system LED to blink on bootup time * Caught exceptions in system health checker to make it more robust * fix issue that fan/psu presence will always be true * fix issue for external checker * move system-health service to right after rc-local service * Set system-health service start after database service * Get system up time via /proc/uptime * Provide more information in stat for CLI to use * fix typo * Set default category to External for external checker * If external checker reported OK, save it to stat too * Trim string for external checker output * fix issue: PSU voltage check always return OK * Add unit test cases for system health library * Fix LGTM warnings * fix demo comments: 1. get boot up timeout from monit configuration file; 2. set system led in library instead of daemon * Remove boot_timeout configuration because it will get from monit config file * Fix argument miss * fix unit test failure * fix issue: summary status is not correct * Fix format issues found in code review * rename th to threshold to make it clearer * Fix review comment: 1. add a .dep file for system health; 2. deprecated daemon_base and uses sonic-py-common instead * Fix unit test failure * Fix LGTM alert * Fix LGTM alert * Fix review comments * Fix review comment * 1. Add relevant comments for system health; 2. 
rename external_checker to user_define_checker * Ignore check for unknown service type * Fix unit test issue * Rename user define checker to user defined checker * Rename user_define_checkers to user_defined_checkers for configuration file * Rename file user_define_checker.py -> user_defined_checker.py * Fix typo * Adjust import order for config.py Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com> * Adjust import order for src/system-health/health_checker/hardware_checker.py Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com> * Adjust import order for src/system-health/scripts/healthd Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com> * Adjust import orders in src/system-health/tests/test_system_health.py * Fix typo * Add new line after import * If system health configuration file does not exist, healthd should exit * Fix indent and enable pytest coverage * Fix typo * Fix typo * Remove global logger and use log functions inherited from super class * Change info level logger to notice level Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>
- Loading branch information
1 parent
8e0e316
commit 1c97a03
Showing
25 changed files
with
1,242 additions
and
13 deletions.
There are no files selected for viewing
8 changes: 4 additions & 4 deletions
8
device/mellanox/x86_64-mlnx_msn2010-r0/system_health_monitoring_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
{ | ||
"services_to_ignore": [], | ||
"devices_to_ignore": ["psu.voltage", "psu.temperature"], | ||
"external_checkers": [], | ||
"user_defined_checkers": [], | ||
"polling_interval": 60, | ||
"led_color": { | ||
"fault": "orange", | ||
"normal": "green", | ||
"booting": "orange_blink" | ||
"fault": "orange", | ||
"normal": "green", | ||
"booting": "orange_blink" | ||
} | ||
} |
8 changes: 4 additions & 4 deletions
8
device/mellanox/x86_64-mlnx_msn2700-r0/system_health_monitoring_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
{ | ||
"services_to_ignore": [], | ||
"devices_to_ignore": ["psu.voltage"], | ||
"external_checkers": [], | ||
"user_defined_checkers": [], | ||
"polling_interval": 60, | ||
"led_color": { | ||
"fault": "orange", | ||
"normal": "green", | ||
"booting": "orange_blink" | ||
"fault": "orange", | ||
"normal": "green", | ||
"booting": "orange_blink" | ||
} | ||
} |
8 changes: 4 additions & 4 deletions
8
device/mellanox/x86_64-mlnx_msn2700_simx-r0/system_health_monitoring_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
{ | ||
"services_to_ignore": [], | ||
"devices_to_ignore": ["psu","asic","fan"], | ||
"external_checkers": [], | ||
"user_defined_checkers": [], | ||
"polling_interval": 60, | ||
"led_color": { | ||
"fault": "orange", | ||
"normal": "green", | ||
"booting": "orange_blink" | ||
"fault": "orange", | ||
"normal": "green", | ||
"booting": "orange_blink" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[Unit] | ||
Description=SONiC system health monitor | ||
Requires=database.service updategraph.service | ||
After=database.service updategraph.service | ||
|
||
[Service] | ||
ExecStart=/usr/local/bin/healthd | ||
Restart=always | ||
|
||
[Install] | ||
WantedBy=multi-user.target |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
SPATH := $($(SYSTEM_HEALTH)_SRC_PATH) | ||
DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/system-health.mk rules/system-health.dep | ||
DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST) | ||
DEP_FILES += $(shell git ls-files $(SPATH)) | ||
|
||
$(SYSTEM_HEALTH)_CACHE_MODE := GIT_CONTENT_SHA | ||
$(SYSTEM_HEALTH)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST) | ||
$(SYSTEM_HEALTH)_DEP_FILES := $(DEP_FILES) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# system health python2 wheel | ||
|
||
SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl | ||
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health | ||
$(SYSTEM_HEALTH)_PYTHON_VERSION = 2 | ||
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE) | ||
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH) | ||
|
||
export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
*/deb_dist/ | ||
*/dist/ | ||
*/build/ | ||
*/*.tar.gz | ||
*/*.egg-info | ||
*/.cache/ | ||
*.pyc | ||
*/__pycache__/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from . import hardware_checker | ||
from . import service_checker |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import json | ||
import os | ||
|
||
from sonic_py_common import device_info | ||
|
||
|
||
class Config(object):
    """
    Manage configuration of system health.

    The configuration is read from a per-platform JSON file
    (``/usr/share/sonic/device/<platform>/system_health_monitoring_config.json``).
    Every entry falls back to a default value when the file is missing or malformed.
    """

    # Default system health check interval
    DEFAULT_INTERVAL = 60

    # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
    DEFAULT_BOOTUP_TIMEOUT = 300

    # Default LED configuration. Different platforms have different LED capabilities. This configuration allows
    # vendors to override the default behavior.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }

    # System health configuration file name
    CONFIG_FILE = 'system_health_monitoring_config.json'

    # Monit service configuration file path
    MONIT_CONFIG_FILE = '/etc/monit/monitrc'

    # Monit service start delay configuration entry
    MONIT_START_DELAY_CONFIG = 'with start delay'

    def __init__(self):
        """
        Constructor. Initialize all configuration entries to default values in case there is no configuration file.
        """
        self.platform_name = device_info.get_platform()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def config_file_exists(self):
        """
        Check whether the platform configuration file is present on disk.
        :return: True if the configuration file exists, False otherwise
        """
        return os.path.exists(self._config_file)

    def load_config(self):
        """
        Load the configuration file from disk.
        1. If there is no configuration file, current config entries will reset to default value
        2. Only read the configuration file if its modification time changed, for better performance
        3. If there is any format issue in configuration file, current config entries will reset to default value
        :return:
        """
        if not self.config_file_exists():
            if self._last_mtime is not None:
                self._reset()
            return

        try:
            # Compare the modification time only. The full stat result also carries the access time, which
            # reading the file may update, and that would force a needless reload on the next poll.
            mtime = os.stat(self._config_file).st_mtime
            if mtime == self._last_mtime:
                return

            self._last_mtime = mtime
            with open(self._config_file, 'r') as f:
                self.config_data = json.load(f)

            self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
            self.ignore_services = self._get_list_data('services_to_ignore')
            self.ignore_devices = self._get_list_data('devices_to_ignore')
            self.user_defined_checkers = self._get_list_data('user_defined_checkers')
        except Exception:
            # Deliberately broad: any I/O or format problem (file removed after the existence check, invalid
            # JSON, unexpected top level type, ...) must fall back to defaults instead of raising to the caller.
            self._reset()

    def _reset(self):
        """
        Reset current configuration entries to default values
        :return:
        """
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def get_led_color(self, status):
        """
        Get desired LED color according to the input status
        :param status: System health status, one of the keys of DEFAULT_LED_CONFIG
        :return: String LED color. The value from the configuration file wins over the default one.
        :raises KeyError: if status is neither configured nor part of DEFAULT_LED_CONFIG
        """
        led_config = self.config_data.get('led_color') if self.config_data else None
        # Only trust a real mapping; "in" on a string would be a substring test
        if isinstance(led_config, dict) and status in led_config:
            return led_config[status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """
        Get boot up timeout from monit configuration file.
        1. If monit configuration file does not exist, return default value
        2. If there is any exception while parsing monit config, return default value
        3. If monit configuration file has no start delay entry, return default value
        :return: Integer timeout value
        """
        if not os.path.exists(Config.MONIT_CONFIG_FILE):
            return self.DEFAULT_BOOTUP_TIMEOUT

        try:
            with open(Config.MONIT_CONFIG_FILE) as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue

                    comment_pos = line.find('#')
                    if comment_pos == 0:
                        # Whole line is a comment
                        continue

                    # Drop a trailing comment only if there is one. Slicing with -1 ("not found") would cut
                    # the last character of the line and e.g. turn a delay of 300 into 30.
                    if comment_pos != -1:
                        line = line[:comment_pos]

                    delay_pos = line.find(Config.MONIT_START_DELAY_CONFIG)
                    if delay_pos != -1:
                        return int(line[delay_pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
        except Exception:
            return self.DEFAULT_BOOTUP_TIMEOUT

        # No start delay entry found in monit configuration
        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_list_data(self, key):
        """
        Get list type configuration data by key and remove duplicate elements.
        :param key: Key of the configuration entry
        :return: A set of configuration data if key exists and its value is a list, else None
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None
Oops, something went wrong.