Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[warm-reboot] Add new preboot health check: verify database integrity (
Browse files Browse the repository at this point in the history
…sonic-net#1785)

What I did
Verify database integrity before proceeding with warm reboot or fast reboot.
This integrity check uses a JSON schema to validate the DBs. To start with, only the presence of the COUNTERS_PORT_NAME_MAP table in COUNTERS_DB is verified, but this list can be extended in the future.
The check logic is designed to be generic: additional databases, or tables within them, can simply be added to the schema list, and the verification logic needs no change.
How I did it
Added a JSON schema, and generic schema validation logic.
vaibhavhd authored Sep 13, 2021

Verified

This commit was signed with the committer’s verified signature.
lann Lann
1 parent 41e31e8 commit c007d65
Showing 2 changed files with 112 additions and 3 deletions.
84 changes: 84 additions & 0 deletions scripts/check_db_integrity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

"""
This script verifies that the database has its critical tables present before warmboot can proceed.
If warmboot is allowed with critical tables missing, it can lead to issues on the going-down
path or on the recovery path. This check detects such issues before proceeding.
The verification procedure uses JSON schemas to verify the DB entities.
In the future, verifying new tables or their content should only require a schema modification;
no modification to the integrity check logic should be needed.
"""

import os, sys
import json, jsonschema
import syslog
import subprocess
import traceback

# Maps a database name to the JSON schema its dump must satisfy.
# To verify another database or table, add it here; main() iterates over
# every entry and needs no change.
DB_SCHEMA = {
    "COUNTERS_DB": {
        "$schema": "http://json-schema.org/draft-06/schema",
        "type": "object",
        "title": "Schema for COUNTERS DB's entities",
        # Tables that must be present in the dump.
        "required": ["COUNTERS_PORT_NAME_MAP"],
        "properties": {
            "COUNTERS_PORT_NAME_MAP": {
                "$id": "#/properties/COUNTERS_PORT_NAME_MAP",
                "type": "object",
            },
        },
    },
}


def main():
    """Validate every database listed in DB_SCHEMA against its JSON schema.

    For each (db_name, schema) entry the database is dumped with
    ``sonic-db-dump`` into ``/tmp/<db_name>.json``, the dump is parsed as
    JSON, and the result is validated with ``jsonschema.validate``.

    Returns:
        0 if DB_SCHEMA is empty or every database passes validation.
        1 if a dump cannot be produced, is not valid JSON, or does not
        satisfy its schema.
    """
    if not DB_SCHEMA:
        return 0

    for db_name, schema in DB_SCHEMA.items():
        db_dump_file = "/tmp/{}.json".format(db_name)
        # Dump the database currently being checked (not a fixed one), and
        # pass the command as an argument list so no shell is involved.
        dump_db_cmd = ["sonic-db-dump", "-n", db_name, "-y"]
        try:
            with open(db_dump_file, "w") as dump_fp:
                proc = subprocess.run(dump_db_cmd, text=True,
                                      stdout=dump_fp, stderr=subprocess.PIPE)
            rc, dump_err = proc.returncode, proc.stderr
        except OSError as os_err:
            # The dump tool could not be started or the dump file could not
            # be opened for writing; treat it like a failed dump.
            rc, dump_err = 1, str(os_err)

        if rc != 0:
            # Without a trustworthy dump there is nothing to validate, so
            # fail the check instead of parsing a stale or empty file.
            msg = "Failed to dump db {}. Return code: {} with err: {}".format(db_name, rc, dump_err)
            print(msg)
            syslog.syslog(syslog.LOG_ERR, msg)
            return 1

        try:
            with open(db_dump_file) as fp:
                db_dump_data = json.load(fp)
        except ValueError as json_err:
            # Logged as an error (not debug): this outcome fails the check.
            syslog.syslog(syslog.LOG_ERR, "DB json file is not a valid json file. "
                          "Error: {}".format(str(json_err)))
            return 1

        # What: Validate if critical tables and entries are present in DB.
        # Why: This is needed to avoid warmbooting with a bad DB; which can
        #      potentially trigger failures in the reboot recovery path.
        # How: Validate DB against a schema which defines required tables.
        try:
            jsonschema.validate(instance=db_dump_data, schema=schema)
        except jsonschema.exceptions.ValidationError as validation_err:
            syslog.syslog(syslog.LOG_ERR, "Database is missing tables/entries needed for reboot procedure. "
                          "DB integrity check failed with:\n{}".format(str(validation_err.message)))
            return 1

    syslog.syslog(syslog.LOG_DEBUG, "Database integrity checks passed.")
    return 0


if __name__ == '__main__':
    # Exit status: whatever main() returns, 1 when interrupted by SIGINT,
    # 2 when main() raises an unexpected exception.
    res = 0
    try:
        res = main()
    except KeyboardInterrupt:
        syslog.syslog(syslog.LOG_NOTICE, "SIGINT received. Quitting")
        res = 1
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR, "Got an exception %s: Traceback: %s" % (str(e), traceback.format_exc()))
        res = 2
    finally:
        syslog.closelog()
        # os._exit() terminates the process without flushing stdio buffers,
        # so flush explicitly or diagnostics printed by main() may be lost
        # when stdout/stderr is a pipe or a file.
        for stream in (sys.stdout, sys.stderr):
            try:
                stream.flush()
            except (OSError, ValueError):
                # Best effort only: a closed or broken stream must not
                # change the exit status of the health check.
                pass
        os._exit(res)
31 changes: 28 additions & 3 deletions scripts/fast-reboot
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@ SHUTDOWN_ORDER_FILE="/etc/sonic/${REBOOT_TYPE}_order"
VERBOSE=no
FORCE=no
IGNORE_ASIC=no
# Set to "yes" by the -d option; check_db_integrity then only logs a failed
# database integrity check instead of aborting the reboot.
IGNORE_DB_CHECK=no
STRICT=no
REBOOT_METHOD="/sbin/kexec -e"
ASSISTANT_IP_LIST=""
@@ -38,6 +39,7 @@ EXIT_SYNCD_SHUTDOWN=11
EXIT_FAST_REBOOT_DUMP_FAILURE=12
EXIT_FILTER_FDB_ENTRIES_FAILURE=13
EXIT_COUNTERPOLL_DELAY_FAILURE=14
# Exit status used by check_db_integrity when the database integrity check
# fails and the -d override was not given.
EXIT_DB_INTEGRITY_FAILURE=15
EXIT_NO_CONTROL_PLANE_ASSISTANT=20
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21

@@ -59,8 +61,9 @@ function showHelpAndExit()
echo "Usage: ${REBOOT_SCRIPT_NAME} [options]"
echo " -h,-? : get this help"
echo " -v : turn on verbose"
echo " -f : force execution"
echo " -i : ignore MD5-checksum-verification of ASIC configuration files"
echo " -f : force execution - ignore Orchagent RESTARTCHECK failure"
echo " -i : force execution - ignore ASIC MD5-checksum-verification"
echo " -d : force execution - ignore database integrity check"
echo " -r : reboot with /sbin/reboot"
echo " -k : reboot with /sbin/kexec -e [default]"
echo " -x : execute script with -x flag"
@@ -74,7 +77,7 @@ function showHelpAndExit()

function parseOptions()
{
while getopts "vfih?rkxc:s" opt; do
while getopts "vfidh?rkxc:s" opt; do
case ${opt} in
h|\? )
showHelpAndExit
@@ -88,6 +91,9 @@ function parseOptions()
i )
IGNORE_ASIC=yes
;;
d )
IGNORE_DB_CHECK=yes
;;
r )
REBOOT_METHOD="/sbin/reboot"
;;
@@ -327,6 +333,23 @@ function check_docker_exec()
done
}
# Run the database integrity check before a warm-reboot or fastfast-reboot.
# A non-zero result from check_db_integrity.py aborts the script with
# EXIT_DB_INTEGRITY_FAILURE unless IGNORE_DB_CHECK is "yes" (-d option),
# in which case the failure is only reported through debug.
function check_db_integrity()
{
    # Other reboot types are not subject to this check.
    if [[ "$REBOOT_TYPE" != "warm-reboot" && "$REBOOT_TYPE" != "fastfast-reboot" ]]; then
        return 0
    fi

    # Capture the checker's exit status without tripping errexit-style handling.
    CHECK_DB_INTEGRITY=0
    /usr/local/bin/check_db_integrity.py || CHECK_DB_INTEGRITY=$?

    if [[ "${CHECK_DB_INTEGRITY}" -eq 0 ]]; then
        return 0
    fi

    if [[ x"${IGNORE_DB_CHECK}" == x"yes" ]]; then
        debug "Ignoring Database integrity checks..."
        return
    fi

    error "Failed to validate DB's integrity. Exit code: ${CHECK_DB_INTEGRITY}. Use '-d' option to force ignore this check."
    exit ${EXIT_DB_INTEGRITY_FAILURE}
}
function reboot_pre_check()
{
check_docker_exec
@@ -337,6 +360,8 @@ function reboot_pre_check()
fi
rm ${filename}
check_db_integrity
# Make sure /host has enough space for warm reboot temp files
avail=$(df -k /host | tail -1 | awk '{ print $4 }')
if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then

0 comments on commit c007d65

Please sign in to comment.