Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SONiC disk checker to handle disk full case and mount overlay fs to allow remote user login. #3700

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 89 additions & 20 deletions scripts/disk_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,40 @@

"""
What:
There have been cases, where disk turns Read-only due to kernel bug.
In Read-only state, system blocks new remote user login via TACACS.
This utility is to check & make transient recovery as needed.
This utility is designed to address two specific issues:
1. Disk becoming read-only due to kernel bugs.
2. Disk running out of space.
When either of these issues occurs, the system prevents new remote user logins via TACACS.

How:
check for Read-Write permission. If Read-only, create writable overlay using tmpfs.
Checks for read-write permissions and available disk space.
If an issue is detected, create writable overlay using tmpfs.

By default "/etc" & "/home" are checked and if in Read-only state, make them Read-Write
By default "/etc" & "/home" are checked and, if an issue is detected, they are made Read-Write
using overlay on top of tmpfs.

Making /etc & /home writable allows new remote users to log in successfully.

If in Read-only state or in Read-Write state with the help of tmpfs overlay,
syslog ERR messages are written, to help raise alerts.
Write syslog ERR messages to help raise alerts in the following cases:
1. Disk in read-only state.
2. Disk out of space.
3. Mounted tmpfs overlay.

Monit may be used to invoke it periodically, to help scan & fix and
report via syslog.

Tidbit:
If you would like to test this script, you could simulate a RO disk
with the following command. Reboot will revert the effect.
To test this script:
1. Simulate a RO disk with the following command. Reboot will revert the effect.
sudo bash -c "echo u > /proc/sysrq-trigger"
2. Use up all disk space by creating a big file in /var/dump/:
dd if=/dev/zero of=/var/dump/sonic_dump_devicename_20241126_204132.tar bs=1G count=50

"""

import argparse
import os
import shutil
import sys
import syslog
import subprocess
Expand All @@ -40,10 +47,16 @@
WORK_DIR = "/run/mount/work"
MOUNTS_FILE = "/proc/mounts"

# Free-space threshold, counted in file-system blocks (not bytes).
# On most file systems the block size is 4096 bytes, so 1024 blocks is 4 MiB.
FREE_SPACE_THRESHOLD = 1024
Copy link
Contributor

@qiluo-msft qiluo-msft Jan 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FREE_SPACE_THRESHOLD

what is the unit? could you add code comment? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed, unit is file system block, on ext4 this means 1024*4096 = 4MB


EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "event-disk"
events_handle = None

DISK_RO_EVENT = "read_only"
DISK_FULL_EVENT = "disk_full"

chk_log_level = syslog.LOG_ERR

def _log_msg(lvl, pfx, msg):
Expand All @@ -64,21 +77,35 @@ def log_debug(m):
_log_msg(syslog.LOG_DEBUG, "Debug", m)


def event_pub():
def event_pub(event):
param_dict = FieldValueMap()
param_dict["fail_type"] = "read_only"
param_dict["fail_type"] = event
event_publish(events_handle, EVENTS_PUBLISHER_TAG, param_dict)


def test_disk_full(dirs):
    """Report whether any directory in `dirs` is out of free disk space.

    Each directory is probed with os.statvfs(); a directory counts as full
    when the number of blocks available to unprivileged users (f_bavail)
    is below FREE_SPACE_THRESHOLD.

    On the first full directory an ERR message is logged, a
    DISK_FULL_EVENT is published and True is returned without checking the
    remaining directories. Returns False when no directory is full.
    """
    for path in dirs:
        free_blocks = os.statvfs(path).f_bavail
        if free_blocks >= FREE_SPACE_THRESHOLD:
            log_debug("{} has enough disk space".format(path))
            continue

        # First directory below the threshold: report it and stop scanning.
        log_err("{} has no free disk space".format(path))
        event_pub(DISK_FULL_EVENT)
        return True

    return False


def test_writable(dirs):
for d in dirs:
rw = os.access(d, os.W_OK)
if not rw:
log_err("{} is not read-write".format(d))
event_pub()
event_pub(DISK_RO_EVENT)
return False
else:
log_debug("{} is Read-Write".format(d))

return True


Expand All @@ -101,7 +128,7 @@ def get_dname(path_name):
return os.path.basename(os.path.normpath(path_name))


def do_mnt(dirs):
def do_mnt(dirs, overlay_prefix):
if os.path.exists(UPPER_DIR):
log_err("Already mounted")
return 1
Expand All @@ -110,7 +137,7 @@ def do_mnt(dirs):
try:
os.mkdir(i)
except OSError as error:
log_err("Failed to create {}".format(i))
log_err("Failed to create {}, error: {}".format(i, error))
return 1

for d in dirs:
Expand All @@ -120,7 +147,7 @@ def do_mnt(dirs):
os.mkdir(d_upper)
os.mkdir(d_work)

ret = run_cmd(["mount", "-t", "overlay", "overlay_{}".format(d_name),\
ret = run_cmd(["mount", "-t", "overlay", "{}_{}".format(overlay_prefix, d_name),
"-o", "lowerdir={},upperdir={},workdir={}".format(d, d_upper, d_work), d])
if ret:
break
Expand All @@ -132,13 +159,36 @@ def do_mnt(dirs):
return ret


def is_mounted(dirs):
def do_unmnt(dirs, overlay_prefix):
    """Lazily unmount the overlays of `dirs` and delete the overlay backing dirs.

    For every directory in `dirs`, runs `umount -l` on the overlay named
    "<overlay_prefix>_<basename of directory>", stopping at the first
    command that returns a non-zero status. Afterwards UPPER_DIR and
    WORK_DIR are removed recursively.

    Returns:
        1 if removing UPPER_DIR or WORK_DIR fails; otherwise the status of
        the last umount command run (0 when all succeeded, or when `dirs`
        is empty).
    """
    # Initialise up front: with an empty `dirs` the loop body never runs and
    # `ret` would otherwise be unbound when it is tested below.
    ret = 0
    for d in dirs:
        d_name = get_dname(d)

        ret = run_cmd(["umount", "-l", "{}_{}".format(overlay_prefix, d_name)])
        if ret:
            break

    if ret:
        log_err("Failed to umount {}".format(dirs))
    else:
        log_info("{} are unmounted".format(dirs))

    # The backing directories are removed regardless of the umount outcome,
    # matching the original behaviour.
    for i in (UPPER_DIR, WORK_DIR):
        try:
            shutil.rmtree(i)
        except OSError as error:
            log_err("Failed to delete {}, error: {}".format(i, error))
            return 1

    return ret


def is_mounted(dirs, overlay_prefix):
if not os.path.exists(UPPER_DIR):
return False

onames = set()
for d in dirs:
onames.add("overlay_{}".format(get_dname(d)))
onames.add("{}_{}".format(overlay_prefix, get_dname(d)))

with open(MOUNTS_FILE, "r") as s:
for ln in s.readlines():
Expand All @@ -153,12 +203,31 @@ def do_check(skip_mount, dirs):
ret = 0
if not test_writable(dirs):
if not skip_mount:
ret = do_mnt(dirs)
ret = do_mnt(dirs, "overlay")

# Check if mounted
if (not ret) and is_mounted(dirs):
if (not ret) and is_mounted(dirs, "overlay"):
log_err("READ-ONLY: Mounted {} to make Read-Write".format(dirs))
event_pub()
event_pub(DISK_RO_EVENT)

if ret:
# When disk mounted, disk no free space issue also been fixed.
return ret

# Handle disk no free space case
if test_disk_full(dirs):
if not skip_mount:
ret = do_mnt(dirs, "overlay_disk_full")

# Check if mounted
if (not ret) and is_mounted(dirs, "overlay_disk_full"):
log_err("DISK-FULL: Mounted {} to make Read-Write".format(dirs))
event_pub(DISK_FULL_EVENT)

# Unmount when disk space issue fixed
if is_mounted(dirs, "overlay_disk_full") and not test_disk_full(["/"]):
log_debug("umount for disk space issue fixed")
do_unmnt(dirs, "overlay_disk_full")

return ret

Expand Down
71 changes: 70 additions & 1 deletion tests/disk_check_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import sys
import syslog
from unittest.mock import patch
Expand Down Expand Up @@ -131,7 +132,9 @@ def setup(self):

@patch("disk_check.syslog.syslog")
@patch("disk_check.subprocess.run")
def test_readonly(self, mock_proc, mock_log):
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 4096,
971520, 883302, 883302, 4096, 255)))
def test_readonly(self, mock_os_statvfs, mock_proc, mock_log):
global err_data, cmds, max_log_lvl

mock_proc.side_effect = mock_subproc_run
Expand Down Expand Up @@ -177,6 +180,72 @@ def test_readonly(self, mock_proc, mock_log):

assert max_log_lvl == syslog.LOG_ERR

@patch("disk_check.syslog.syslog")
@patch("disk_check.subprocess.run")
@patch('os.access', return_value=True)
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 0,
                                                     971520, 883302, 883302, 4096, 255)))
def test_mount_disk_full(self, mock_os_statvfs, mock_os_access, mock_proc, mock_log):
    # Drives disk_check.main() with os.access patched to report every path as
    # writable and os.statvfs patched so that f_bavail (5th field) is 0.
    # NOTE(review): this test has no assertions; it only checks that main()
    # runs to completion under these mocks.
    global max_log_lvl
    max_log_lvl = -1
    mock_proc.side_effect = mock_subproc_run
    mock_log.side_effect = report_err_msg

    tc = {
        "upperdir": "/tmp",
    }
    # presumably points disk_check's upper dir at tc["upperdir"] -- confirm
    # against swap_upper, which is defined outside this view.
    swap_upper(tc)

    with patch('sys.argv', ["", "-d", "/tmpx"]):
        disk_check.main()

@patch("disk_check.syslog.syslog")
@patch("disk_check.subprocess.run")
@patch('shutil.rmtree')
@patch('os.access', return_value=True)
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 4096,
971520, 883302, 883302, 4096, 255)))
def test_unmount_disk_full(self, mock_os_statvfs, mock_os_access, mock_rmtree, mock_proc, mock_log):
liuh-80 marked this conversation as resolved.
Show resolved Hide resolved
global max_log_lvl
max_log_lvl = -1
mock_proc.side_effect = mock_subproc_run
mock_log.side_effect = report_err_msg

tc = {
"upperdir": "/tmp/tmpx",
"workdir": "/tmp/tmpy"
}
swap_upper(tc)
swap_work(tc)

with patch('sys.argv', ["", "-d", "/tmpx"]):
disk_check.main()

@patch("disk_check.syslog.syslog")
@patch("disk_check.subprocess.run")
@patch('os.access', return_value=True)
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 0,
                                                     971520, 883302, 883302, 4096, 255)))
def test_diskfull(self, mock_os_statvfs, mock_os_access, mock_proc, mock_log):
    # Checks that test_disk_full() returns True when os.statvfs reports
    # f_bavail (5th field of the patched result) as 0.
    global max_log_lvl
    max_log_lvl = -1
    mock_proc.side_effect = mock_subproc_run
    mock_log.side_effect = report_err_msg

    result = disk_check.test_disk_full(["/etc"])
    assert result is True

@patch("disk_check.syslog.syslog")
@patch("disk_check.subprocess.run")
def test_do_unmnt(self, mock_proc, mock_log):
    # Calls do_unmnt() directly with subprocess.run and syslog mocked.
    # NOTE(review): this test has no assertions, and shutil.rmtree is not
    # patched here, so do_unmnt() will call the real rmtree on whatever
    # disk_check.UPPER_DIR / WORK_DIR currently hold -- confirm that is intended.
    global max_log_lvl
    max_log_lvl = -1
    mock_proc.side_effect = mock_subproc_run
    mock_log.side_effect = report_err_msg

    disk_check.do_unmnt(["/etc"], "overlay_prefix")


@classmethod
def teardown_class(cls):
subprocess.run("rm -rf /tmp/tmp*", shell=True) # cleanup the temporary dirs
Expand Down
Loading