Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Install systemd-coredump in SONiC to be used as the core dump utility #6079

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions build_debian.sh
Original file line number Diff line number Diff line change
Expand Up @@ -392,11 +392,8 @@ rm /files/lib/systemd/system/rsyslog.service/Service/ExecStart/arguments
set /files/lib/systemd/system/rsyslog.service/Service/ExecStart/arguments/1 -n
"

sudo mkdir -p $FILESYSTEM_ROOT/var/core

# Config sysctl
sudo augtool --autosave "
set /files/etc/sysctl.conf/kernel.core_pattern '|/usr/local/bin/coredump-compress %e %t %p %P'
set /files/etc/sysctl.conf/kernel.softlockup_panic 1
set /files/etc/sysctl.conf/kernel.panic 10
set /files/etc/sysctl.conf/vm.panic_on_oom 2
Expand Down
3 changes: 3 additions & 0 deletions files/build_templates/docker_image_ctl.j2
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,9 @@ start() {
--uts=host \{# W/A: this should be set per-docker, for those dockers which really need host's UTS namespace #}
{%- if install_debug_image == "y" %}
-v /src:/src:ro -v /debug:/debug:rw \
-v /var/log/journal:/var/log/journal:ro \
-v /var/lib/systemd/coredump:/var/lib/systemd/coredump:ro \
-v /etc/machine-id:/etc/machine-id:ro \
{%- endif %}
{%- if '--log-driver=json-file' in docker_image_run_opt or '--log-driver' not in docker_image_run_opt %}
--log-opt max-size=2M --log-opt max-file=5 \
Expand Down
18 changes: 18 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,24 @@ sudo cp $IMAGE_CONFIGS/constants/constants.yml $FILESYSTEM_ROOT/etc/sonic/
sudo cp $IMAGE_CONFIGS/sudoers/sudoers $FILESYSTEM_ROOT/etc/
sudo cp $IMAGE_CONFIGS/sudoers/sudoers.lecture $FILESYSTEM_ROOT/etc/


# Delete the /var/lib/systemd/coredump entries from tmpfiles.d/systemd.conf so that
# cleanup of core files is left to systemd-coredump and not tmpfiles.d
sudo sed -i "/\/var\/lib\/systemd\/coredump/d" $FILESYSTEM_ROOT/usr/lib/tmpfiles.d/systemd.conf

# Customize systemd-coredump configuration through a coredump.conf.d drop-in
sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/coredump.conf.d
sudo cp $IMAGE_CONFIGS/coredump/coredump.conf.d/00-sonic-coredump.conf $FILESYSTEM_ROOT/etc/systemd/coredump.conf.d

## Enable persistent journal to store coredump history (journald.conf.d drop-in)
# NOTE(review): this copy uses a literal files/image_config path while the coredump
# drop-in above uses $IMAGE_CONFIGS - confirm both resolve to the same directory
sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/journald.conf.d/
sudo cp files/image_config/journald/journald.conf.d/00-sonic-journald.conf $FILESYSTEM_ROOT/etc/systemd/journald.conf.d/

## Shortcut to access core files: /var/core is a symlink to /var/lib/systemd/coredump
sudo ln -sf /var/lib/systemd/coredump $FILESYSTEM_ROOT/var/core

## Install systemd-coredump as the application core dump handler
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get install -y systemd-coredump

# Copy systemd timer configuration
sudo cp $BUILD_TEMPLATES/pcie-check.timer $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable pcie-check.timer
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# systemd-coredump settings for SONiC.
# Presumably the 00-sonic-coredump.conf drop-in that the image build copies into
# /etc/systemd/coredump.conf.d (the file name is not shown here) - TODO confirm.
[Coredump]
Storage=external
Compress=yes
ProcessSizeMax=8G
ExternalSizeMax=8G
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# journald settings for SONiC.
# Presumably the 00-sonic-journald.conf drop-in that the image build copies into
# /etc/systemd/journald.conf.d (the file name is not shown here) - TODO confirm.
[Journal]
Storage=persistent
SystemMaxUse=256M
# NOTE(review): 356M here vs. 256M for SystemMaxUse above - confirm this is intended and not a typo
RuntimeMaxUse=356M
MaxLevelStore=crit
93 changes: 64 additions & 29 deletions files/scripts/core_cleanup.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,78 @@
#!/usr/bin/env python3

#!/usr/bin/python3
import argparse
from stat import ST_CTIME
import os
from collections import defaultdict
from datetime import datetime

from sonic_py_common.logger import Logger
import sys

CORE_FILE_DIR=r'/var/core'
DEFAULT_INTERVAL=0
DEFAULT_MAX_DUMPS=4
SYSLOG_IDENTIFIER = 'core_cleanup.py'
CORE_FILE_DIR = '/var/core/'
MAX_CORE_FILES = 4

def exe_name(filename):
    """Return the executable-name field of a core file name.

    The name is split on '.' and the second field is returned. A name that
    contains no '.' is returned unchanged.
    """
    fields = filename.split('.')
    return fields[1] if len(fields) > 1 else filename

def corefile_cleanup(input_dir, max_dumps, logger):
    """Delete duplicate core files, keeping at most ``max_dumps`` per executable.

    Every entry of ``input_dir`` is grouped by the name ``exe_name()`` derives
    from its file name. Within a group the entries are ordered newest first
    (by ``ST_CTIME``, ties broken by path); the ``max_dumps - 1`` newest
    entries and the single oldest entry are kept and the rest are removed.

    Args:
        input_dir: Directory whose entries are examined.
        max_dumps: Number of core files retained per executable.
        logger: Object providing ``log_info(msg)`` and ``log_error(msg)``.
    """
    # Group (ctime, path) pairs by executable name to identify duplicates.
    exe_groups = {}
    for fn in os.listdir(input_dir):
        path = os.path.join(input_dir, fn)
        try:
            cdate = os.stat(path)[ST_CTIME]
        except FileNotFoundError:
            # The entry disappeared between listdir() and stat(); nothing
            # is left to clean up, and it must not abort the whole run.
            continue
        except OSError as e:
            logger.log_error('Exception [{}] while reading the core file {}'.format(e, path))
            continue
        exe_groups.setdefault(exe_name(fn), []).append((cdate, path))

    for core_files in exe_groups.values():
        if len(core_files) <= 1:
            continue
        last_idx = len(core_files) - 1
        for idx, (_, path) in enumerate(sorted(core_files, reverse=True)):
            # Keep the (max_dumps - 1) newest entries and the oldest one.
            if idx < (max_dumps - 1) or idx == last_idx:
                continue
            logger.log_info('Deleting the core file {}'.format(path))
            try:
                os.remove(path)
            except FileNotFoundError:
                # Already removed by someone else; the goal is met.
                pass
            except Exception as e:
                logger.log_error('Exception [{}] while deleting the core file {}'.format(e, path))

def main():
logger = Logger(SYSLOG_IDENTIFIER)
logger.set_min_log_priority_info()

if os.getuid() != 0:
logger.log_error('Root required to clean up core files')
return
# Add allowed arguments
parser = argparse.ArgumentParser(description="Core file garbage collector:\n\n"
"Duplicate core files generated for the same exectable program are deleted.\n"
"Only the oldest and n-1 latest duplicate core files are saved, rest of them are deleted.",
formatter_class=argparse.RawTextHelpFormatter)

logger.log_info('Cleaning up core files')
core_files = [f for f in os.listdir(CORE_FILE_DIR) if os.path.isfile(os.path.join(CORE_FILE_DIR, f))]

core_files_by_process = defaultdict(list)
for f in core_files:
process = f.split('.')[0]
curr_files = core_files_by_process[process]
curr_files.append(f)

if len(curr_files) > MAX_CORE_FILES:
curr_files.sort(reverse = True, key = lambda x: datetime.utcfromtimestamp(int(x.split('.')[1])))
oldest_core = curr_files[MAX_CORE_FILES]
logger.log_info('Deleting {}'.format(oldest_core))
try:
os.remove(os.path.join(CORE_FILE_DIR, oldest_core))
except:
logger.log_error('Unexpected error occured trying to delete {}'.format(oldest_core))
core_files_by_process[process] = curr_files[0:MAX_CORE_FILES]
# Directory to clean-up
parser.add_argument("-d", "--input-dir",
metavar='DIR', default=CORE_FILE_DIR,
help="core files directory to cleanup")

# Maximum number of duplicate core files stored
parser.add_argument("-n", "--max-dumps", type=int,
default=DEFAULT_MAX_DUMPS, choices=range(1, 11),
help='maximum number of core files saved for a given executable program. Default: 4')

# Parse command arguments
options = parser.parse_args()

if not os.path.isdir(options.input_dir):
logger.log_error("Error! Directory {} does not exist".format(options.input_dir))
sys.exit(1)

logger.log_info('Cleaning up core files')
corefile_cleanup(options.input_dir, options.max_dumps, logger)
logger.log_info('Finished cleaning up core files')

if __name__ == '__main__':
if __name__== "__main__":
main()
3 changes: 2 additions & 1 deletion rules/docker-base-buster.mk
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ VIM = vim
OPENSSH = openssh-client
SSHPASS = sshpass
STRACE = strace
$(DOCKER_BASE_BUSTER)_DBG_IMAGE_PACKAGES += $(GDB) $(GDBSERVER) $(VIM) $(OPENSSH) $(SSHPASS) $(STRACE)
SYSTEMD_COREDUMP=systemd-coredump
$(DOCKER_BASE_BUSTER)_DBG_IMAGE_PACKAGES += $(GDB) $(GDBSERVER) $(VIM) $(OPENSSH) $(SSHPASS) $(STRACE) $(SYSTEMD_COREDUMP)

SONIC_DOCKER_IMAGES += $(DOCKER_BASE_BUSTER)
3 changes: 2 additions & 1 deletion rules/docker-base-stretch.mk
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ VIM = vim
OPENSSH = openssh-client
SSHPASS = sshpass
STRACE = strace
$(DOCKER_BASE_STRETCH)_DBG_IMAGE_PACKAGES += $(GDB) $(GDBSERVER) $(VIM) $(OPENSSH) $(SSHPASS) $(STRACE)
SYSTEMD_COREDUMP = systemd-coredump
$(DOCKER_BASE_STRETCH)_DBG_IMAGE_PACKAGES += $(GDB) $(GDBSERVER) $(VIM) $(OPENSSH) $(SSHPASS) $(STRACE) $(SYSTEMD_COREDUMP)

SONIC_DOCKER_IMAGES += $(DOCKER_BASE_STRETCH)
SONIC_STRETCH_DOCKERS += $(DOCKER_BASE_STRETCH)
57 changes: 57 additions & 0 deletions src/sonic-host-services/scripts/hostcfgd
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def obfuscate(data):
return data


def run_cmd(cmd, log_err = True):
    """Run ``cmd`` through the shell; log a failure instead of raising.

    Args:
        cmd: Command line handed to ``subprocess.check_call(..., shell=True)``.
        log_err: When true, a failure is written to syslog at LOG_ERR.

    Returns:
        None. Failures are swallowed after optional logging.
    """
    try:
        subprocess.check_call(cmd, shell = True)
    except subprocess.CalledProcessError as err:
        if log_err:
            syslog.syslog(syslog.LOG_ERR, "{} - failed: return code - {}, output:\n{}"
                .format(err.cmd, err.returncode, err.output))
    except Exception as err:
        # Anything other than CalledProcessError (e.g. OSError, ValueError) has
        # no cmd/returncode/output attributes, so report the exception itself.
        if log_err:
            syslog.syslog(syslog.LOG_ERR, "{} - failed: {}".format(cmd, err))

class Iptables(object):
def __init__(self):
Expand Down Expand Up @@ -226,6 +233,47 @@ class AaaCfg(object):
with open(NSS_TACPLUS_CONF, 'w') as f:
f.write(nss_tacplus_conf)

class CoredumpCfg(object):
    """Apply the COREDUMP "config" entry of the config DB to the host.

    Core dumps are disabled by writing a sysctl drop-in that sets an empty
    ``kernel.core_pattern`` and enabled again by removing that file; after
    either change ``systemd-sysctl`` is restarted so the files are re-read.
    """

    # sysctl drop-in whose presence marks core dumps as disabled
    DISABLE_COREDUMP_CONF = "/etc/sysctl.d/50-disable-coredump.conf"

    def __init__(self, CfgDb):
        """Remember the config DB handle and the default field values."""
        self.config_db = CfgDb
        self.coredump_defaults = { "enabled" : "true" }

    def load(self, coredump_table):
        """Apply the initial configuration.

        Fields missing from ``coredump_table["config"]`` take their default
        value, and that default is written back through
        ``config_db.mod_entry("COREDUMP", "config", ...)``.

        Args:
            coredump_table: Mapping of COREDUMP table keys to field dicts.
        """
        syslog.syslog(syslog.LOG_INFO, "CoredumpCfg load ...")
        coredump_conf = coredump_table.get("config", {})
        data = {}
        for row, default in self.coredump_defaults.items():
            value = coredump_conf.get(row)
            if value is None:
                value = default
                self.config_db.mod_entry("COREDUMP", "config", { row : value})
            data[row] = value
        self.coredump_update("config", data, True)

    def coredump_update(self, key, data, isLoad):
        """Enable or disable core dumps according to ``data["enabled"]``.

        Args:
            key: Table key; only "config" is acted upon.
            data: Field dict for the key. ``None`` or a missing "enabled"
                field falls back to the default.
            isLoad: Accepted for the callers' benefit; not used here.
        """
        syslog.syslog(syslog.LOG_INFO, "Coredump global configuration update")
        if key != "config":
            return

        coredump_enabled = (data or {}).get("enabled")
        if coredump_enabled is None:
            coredump_enabled = self.coredump_defaults["enabled"]
        # Anything other than "false" (case-insensitive) counts as enabled.
        enabled = str(coredump_enabled).lower() != "false"
        conf_present = os.path.isfile(self.DISABLE_COREDUMP_CONF)

        if not enabled and not conf_present:
            syslog.syslog(syslog.LOG_INFO, 'Disabling systemd-coredump')
            with open(self.DISABLE_COREDUMP_CONF, "w") as fp:
                fp.write("kernel.core_pattern=")
        elif enabled and conf_present:
            syslog.syslog(syslog.LOG_INFO, 'Enabling systemd-coredump')
            os.remove(self.DISABLE_COREDUMP_CONF)
        else:
            # Already in the requested state; nothing to re-read.
            return

        # Read sysctl conf files again
        run_cmd("systemctl restart systemd-sysctl")

class HostConfigDaemon:
def __init__(self):
Expand All @@ -240,6 +288,10 @@ class HostConfigDaemon:

self.is_multi_npu = device_info.is_multi_npu()

# Load Coredump configuration
self.coredumpCfg = CoredumpCfg(self.config_db)
self.coredumpCfg.load(self.config_db.get_table('COREDUMP'))


def load(self):
aaa = self.config_db.get_table('AAA')
Expand Down Expand Up @@ -378,13 +430,18 @@ class HostConfigDaemon:
self.cached_feature_states[feature_name] = state
self.update_feature_state(feature_name, state, feature_table)

def coredump_handler(self, key, data):
    """Forward a COREDUMP table change to the coredump configuration object."""
    syslog.syslog(syslog.LOG_INFO, 'Coredump handler...')
    is_load = False  # a handler call is a runtime change, not the initial load
    self.coredumpCfg.coredump_update(key, data, is_load)

def start(self):

self.config_db.subscribe('AAA', lambda table, key, data: self.aaa_handler(key, data))
self.config_db.subscribe('TACPLUS_SERVER', lambda table, key, data: self.tacacs_server_handler(key, data))
self.config_db.subscribe('TACPLUS', lambda table, key, data: self.tacacs_global_handler(key, data))
self.config_db.subscribe('LOOPBACK_INTERFACE', lambda table, key, data: self.lpbk_handler(key, data))
self.config_db.subscribe('FEATURE', lambda table, key, data: self.feature_state_handler(key, data))
self.config_db.subscribe('COREDUMP', lambda table, key, data: self.coredump_handler(key, data))

# Update all feature states once upon starting
self.update_all_feature_states()
Expand Down