Skip to content

Commit

Permalink
[system-health] Add support for monitoring system health (#4835)
Browse files Browse the repository at this point in the history
* system health first commit

* system health daemon first commit

* Finish healthd

* Changes due to lower layer logic change

* Get ASIC temperature from TEMPERATURE_INFO table

* Add system health make rule and service files

* fix bugs found during manual test

* Change make file to install system-health library to host

* Set system LED to blink on bootup time

* Caught exceptions in system health checker to make it more robust

* fix issue that fan/psu presence will always be true

* fix issue for external checker

* move system-health service to right after rc-local service

* Set system-health service start after database service

* Get system up time via /proc/uptime

* Provide more information in stat for CLI to use

* fix typo

* Set default category to External for external checker

* If external checker reported OK, save it to stat too

* Trim string for external checker output

* fix issue: PSU voltage check always return OK

* Add unit test cases for system health library

* Fix LGTM warnings

* fix demo comments: 1. get boot up timeout from monit configuration file; 2. set system led in library instead of daemon

* Remove boot_timeout configuration because it will get from monit config file

* Fix argument miss

* fix unit test failure

* fix issue: summary status is not correct

* Fix format issues found in code review

* rename th to threshold to make it clearer

* Fix review comment: 1. add a .dep file for system health; 2. deprecated daemon_base and uses sonic-py-common instead

* Fix unit test failure

* Fix LGTM alert

* Fix LGTM alert

* Fix review comments

* Fix review comment

* 1. Add relevant comments for system health; 2. rename external_checker to user_define_checker

* Ignore check for unknown service type

* Fix unit test issue

* Rename user define checker to user defined checker

* Rename user_define_checkers to user_defined_checkers for configuration file

* Rename file user_define_checker.py -> user_defined_checker.py

* Fix typo

* Adjust import order for config.py

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>

* Adjust import order for src/system-health/health_checker/hardware_checker.py

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>

* Adjust import order for src/system-health/scripts/healthd

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>

* Adjust import orders in src/system-health/tests/test_system_health.py

* Fix typo

* Add new line after import

* If system health configuration file not exist, healthd should exit

* Fix indent and enable pytest coverage

* Fix typo

* Fix typo

* Remove global logger and use log functions inherited from super class

* Change info level logger to notice level

Co-authored-by: Joe LeVeque <jleveque@users.noreply.github.com>
  • Loading branch information
Junchao-Mellanox and jleveque authored Oct 12, 2020
1 parent 8e0e316 commit 1c97a03
Show file tree
Hide file tree
Showing 25 changed files with 1,242 additions and 13 deletions.
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": ["psu.voltage", "psu.temperature"],
"external_checkers": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": ["psu.voltage"],
"external_checkers": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"services_to_ignore": [],
"devices_to_ignore": ["psu","asic","fan"],
"external_checkers": [],
"user_defined_checkers": [],
"polling_interval": 60,
"led_color": {
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
"fault": "orange",
"normal": "green",
"booting": "orange_blink"
}
}
10 changes: 10 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,12 @@ sudo cp {{platform_common_py2_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $PLATFORM_COMMON_PY2_WHEEL_NAME
sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2_WHEEL_NAME

# Install system-health Python 2 package
SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}})
sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $SYSTEM_HEALTH_PY2_WHEEL_NAME
sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME

# Install sonic-platform-common Python 3 package
PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}})
sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME
Expand Down Expand Up @@ -283,6 +289,10 @@ sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d
sudo cp $IMAGE_CONFIGS/syslog/override.conf $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d/override.conf
sudo cp $IMAGE_CONFIGS/syslog/host_umount.sh $FILESYSTEM_ROOT/usr/bin/

# Copy system-health files
sudo LANG=C cp $IMAGE_CONFIGS/system-health/system-health.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
echo "system-health.service" | sudo tee -a $GENERATED_SERVICE_FILE

# Copy logrotate.d configuration files
sudo cp -f $IMAGE_CONFIGS/logrotate/logrotate.d/* $FILESYSTEM_ROOT/etc/logrotate.d/

Expand Down
11 changes: 11 additions & 0 deletions files/image_config/system-health/system-health.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# systemd unit that runs the SONiC system health daemon (healthd).
[Unit]
Description=SONiC system health monitor
# Pull in the database and updategraph services, and order this unit after them.
Requires=database.service updategraph.service
After=database.service updategraph.service

[Service]
ExecStart=/usr/local/bin/healthd
# Ask systemd to restart healthd whenever it stops, regardless of how it exited.
Restart=always

[Install]
WantedBy=multi-user.target
8 changes: 8 additions & 0 deletions rules/system-health.dep
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Dependency description for the system-health wheel.
# SPATH is the package source directory assigned in rules/system-health.mk.
SPATH := $($(SYSTEM_HEALTH)_SRC_PATH)
# Files this target depends on: the common build files, this target's own rule
# files, and every git-tracked file under SPATH.
DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/system-health.mk rules/system-health.dep
DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST)
DEP_FILES += $(shell git ls-files $(SPATH))

# NOTE(review): GIT_CONTENT_SHA presumably keys the build cache on the content of
# the dependency files - confirm against the SONiC build cache documentation.
$(SYSTEM_HEALTH)_CACHE_MODE := GIT_CONTENT_SHA
$(SYSTEM_HEALTH)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST)
$(SYSTEM_HEALTH)_DEP_FILES := $(DEP_FILES)
9 changes: 9 additions & 0 deletions rules/system-health.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# system health python2 wheel

# Wheel file name; it is also the prefix of the per-target variables below.
SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
$(SYSTEM_HEALTH)_PYTHON_VERSION = 2
# Other wheels this target lists as dependencies.
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE)
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)

# Export the wheel location under PYTHON_WHEELS_PATH; the variable name matches the
# system_health_py2_wheel_path placeholder in files/build_templates/sonic_debian_extension.j2.
export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
3 changes: 2 additions & 1 deletion slave.mk
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,8 @@ $(addprefix $(TARGET_PATH)/, $(SONIC_INSTALLERS)) : $(TARGET_PATH)/% : \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(REDIS_DUMP_LOAD_PY2)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MODELS_PY3)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY))
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) \
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))
$(HEADER)
# Pass initramfs and linux kernel explicitly. They are used for all platforms
export debs_path="$(IMAGE_DISTRO_DEBS_PATH)"
Expand Down
8 changes: 8 additions & 0 deletions src/system-health/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
*/deb_dist/
*/dist/
*/build/
*/*.tar.gz
*/*.egg-info
*/.cache/
*.pyc
*/__pycache__/
2 changes: 2 additions & 0 deletions src/system-health/health_checker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from . import hardware_checker
from . import service_checker
144 changes: 144 additions & 0 deletions src/system-health/health_checker/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import json
import os

from sonic_py_common import device_info


class Config(object):
    """
    Manage configuration of system health.

    The configuration is read from a per-platform JSON file. Every entry falls back to a default value when the
    file is missing or cannot be parsed.
    """

    # Default system health check interval
    DEFAULT_INTERVAL = 60

    # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
    DEFAULT_BOOTUP_TIMEOUT = 300

    # Default LED configuration. Different platform has different LED capability. This configuration allow vendor to
    # override the default behavior.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }

    # System health configuration file name
    CONFIG_FILE = 'system_health_monitoring_config.json'

    # Monit service configuration file path
    MONIT_CONFIG_FILE = '/etc/monit/monitrc'

    # Monit service start delay configuration entry
    MONIT_START_DELAY_CONFIG = 'with start delay'

    def __init__(self):
        """
        Constructor. Initialize all configuration entry to default value in case there is no configuration file.
        """
        self.platform_name = device_info.get_platform()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def config_file_exists(self):
        """
        Check whether the system health configuration file is present on disk.
        :return: True if the configuration file exists, False otherwise
        """
        return os.path.exists(self._config_file)

    def load_config(self):
        """
        Load the configuration file from disk.
            1. If there is no configuration file, current config entries will reset to default value
            2. Only read the configuration file if its modification time changed, for better performance
            3. If there is any format issues in configuration file, current config entries will reset to default value
        :return:
        """
        if not self.config_file_exists():
            if self._last_mtime is not None:
                self._reset()
            return

        # Compare the modification time only. A full os.stat() result also carries the access time, which
        # changes on every read and would force a reload on each poll.
        mtime = os.stat(self._config_file).st_mtime
        if mtime == self._last_mtime:
            return

        try:
            self._last_mtime = mtime
            with open(self._config_file, 'r') as f:
                self.config_data = json.load(f)

            self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
            self.ignore_services = self._get_list_data('services_to_ignore')
            self.ignore_devices = self._get_list_data('devices_to_ignore')
            self.user_defined_checkers = self._get_list_data('user_defined_checkers')
        except Exception:
            # Any read or format problem falls back to defaults. _reset() also clears _last_mtime, so the
            # file is parsed again on the next call.
            self._reset()

    def _reset(self):
        """
        Reset current configuration entry to default value
        :return:
        """
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.user_defined_checkers = None

    def get_led_color(self, status):
        """
        Get desired LED color according to the input status.
        A color from the 'led_color' entry of the loaded configuration takes precedence over DEFAULT_LED_CONFIG.
        :param status: System health status
        :return: String LED color
        :raises KeyError: If status is found neither in the loaded configuration nor in DEFAULT_LED_CONFIG
        """
        if self.config_data and 'led_color' in self.config_data:
            if status in self.config_data['led_color']:
                return self.config_data['led_color'][status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """
        Get boot up timeout from monit configuration file.
            1. If monit configuration file does not exist, return default value
            2. If there is any exception while parsing monit config, return default value
            3. If monit configuration has no start delay entry, return default value
        :return: Integer timeout value
        """
        if not os.path.exists(Config.MONIT_CONFIG_FILE):
            return self.DEFAULT_BOOTUP_TIMEOUT

        try:
            with open(Config.MONIT_CONFIG_FILE) as f:
                for line in f:
                    # Keep only the text before a '#' comment. Splitting leaves comment-free lines intact,
                    # whereas slicing with the -1 returned by find() would drop the last character.
                    line = line.split('#', 1)[0].strip()
                    if not line:
                        continue

                    pos = line.find(Config.MONIT_START_DELAY_CONFIG)
                    if pos != -1:
                        return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
        except Exception:
            return self.DEFAULT_BOOTUP_TIMEOUT

        # No start delay entry in the monit configuration
        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_list_data(self, key):
        """
        Get list type configuration data by key and remove duplicate element.
        :param key: Key of the configuration entry
        :return: A set of configuration data if key exists and its value is a list, None otherwise
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None
Loading

0 comments on commit 1c97a03

Please sign in to comment.