Skip to content

Commit

Permalink
[201803] [services] Restart SwSS service upon unexpected critical pro…
Browse files Browse the repository at this point in the history
…cess exit (#2546)

* [service] Restart SwSS Docker container if orchagent exits unexpectedly

* [dhcp_relay] Use STATE_DB to determine whether interfaces are ready

* Supervisor now autorestarts rsyslogd upon unexpected exit

* Add other critical processes to event listener

* Make supervisor-proc-exit-listener script global, have it read from 'critical_processes' file inside container

* Add SwSS to 'WantedBy=' option of services which should be started along with SwSS
  • Loading branch information
jleveque authored and lguohan committed Feb 26, 2019
1 parent ec694a5 commit 2a8af27
Show file tree
Hide file tree
Showing 19 changed files with 127 additions and 60 deletions.
38 changes: 18 additions & 20 deletions dockers/docker-dhcp-relay/wait_for_intf.sh.j2
Original file line number Diff line number Diff line change
@@ -1,42 +1,40 @@
#!/usr/bin/env bash

function wait_until_iface_ready
{
IFACE=$1
STATE_DB_IDX="6"

echo "Waiting until interface $IFACE is up..."

# Wait for the interface to come up (i.e., 'ip link show' returns 0)
until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"

echo "Interface $IFACE is up"
function wait_until_iface_ready
{
TABLE_PREFIX=$1
IFACE=$2

echo "Waiting until interface $IFACE has an IPv4 address..."
echo "Waiting until interface $IFACE is ready..."

# Wait until the interface gets assigned an IPv4 address
# Wait for the interface to come up
# (i.e., interface is present in STATE_DB and state is "ok")
while true; do
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)

if [ -n "$IP" ]; then
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
if [ x"$RESULT" == x"ok" ]; then
break
fi

sleep 1
done

echo "Interface $IFACE is configured with IP $IP"
echo "Interface ${IFACE} is ready!"
}


# Wait for all interfaces to come up and have IPv4 addresses assigned
# Wait for all interfaces to be up and ready
{% for (name, prefix) in INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }}
{% endfor %}
{% for (name, prefix) in VLAN_INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }}
{% endfor %}
{% for (name, prefix) in PORTCHANNEL_INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }}
{% endfor %}
2 changes: 2 additions & 0 deletions dockers/docker-orchagent/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ COPY ["files/arp_update", "/usr/bin"]
COPY ["enable_counters.py", "/usr/bin"]
COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]

## Copy all Jinja2 template files into the templates folder
COPY ["*.j2", "/usr/share/sonic/templates/"]
Expand Down
7 changes: 7 additions & 0 deletions dockers/docker-orchagent/critical_processes
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
orchagent
portsyncd
intfsyncd
neighsyncd
vlanmgrd
intfmgrd
buffermgrd
8 changes: 7 additions & 1 deletion dockers/docker-orchagent/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected

[program:start.sh]
command=/usr/bin/start.sh
priority=1
Expand All @@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

Expand Down
2 changes: 1 addition & 1 deletion files/build_templates/dhcp_relay.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop

[Install]
WantedBy=multi-user.target teamd.service
WantedBy=multi-user.target swss.service teamd.service
2 changes: 1 addition & 1 deletion files/build_templates/radv.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop

[Install]
WantedBy=multi-user.target
WantedBy=multi-user.target swss.service
3 changes: 3 additions & 0 deletions files/build_templates/snmp.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ Before=ntp-config.service
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
ExecStart=/usr/bin/{{docker_container_name}}.sh attach
ExecStop=/usr/bin/{{docker_container_name}}.sh stop

[Install]
WantedBy=multi-user.target swss.service
4 changes: 4 additions & 0 deletions files/build_templates/swss.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ After=opennsl-modules-3.16.0-6-amd64.service
After=nps-modules-3.16.0-6-amd64.service
{% endif %}
Before=ntp-config.service
StartLimitInterval=1200
StartLimitBurst=3

[Service]
User=root
Expand Down Expand Up @@ -52,6 +54,8 @@ ExecStopPost=/usr/bin/mst stop
ExecStopPost=/etc/init.d/xpnet.sh stop
ExecStopPost=/etc/init.d/xpnet.sh start
{% endif %}
Restart=always
RestartSec=30

[Install]
WantedBy=multi-user.target
6 changes: 3 additions & 3 deletions files/build_templates/teamd.service.j2
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[Unit]
Description=TEAMD container
Requires=updategraph.service
After=updategraph.service
Requires=updategraph.service swss.service
After=updategraph.service swss.service
Before=ntp-config.service

[Service]
Expand All @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh attach
ExecStop=/usr/bin/{{docker_container_name}}.sh stop

[Install]
WantedBy=multi-user.target
WantedBy=multi-user.target swss.service
45 changes: 45 additions & 0 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

import os
import signal
import sys
import syslog

from supervisor import childutils

# Contents of file should be the names of critical processes (as defined in
# the container's supervisord.conf file), one per line
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'

def main():
    """Run as a supervisor event listener and stop supervisor on critical exits.

    Loads the names of critical processes from CRITICAL_PROCESSES_FILE (one
    name per line), then loops forever reading event notifications from
    stdin. When a PROCESS_STATE_EXITED event reports that one of the critical
    processes exited unexpectedly, a message is written to syslog and SIGTERM
    is sent to the parent process (supervisor) so that it terminates.

    Raises:
        IOError/OSError: if CRITICAL_PROCESSES_FILE cannot be opened.
    """
    # Read the list of critical processes from a file. Surrounding whitespace
    # is stripped and blank lines are ignored so that a trailing space, a
    # CRLF line ending or an empty line cannot prevent a name from matching.
    with open(CRITICAL_PROCESSES_FILE, 'r') as f:
        critical_processes = [line.strip() for line in f if line.strip()]

    while True:
        # Transition from ACKNOWLEDGED to READY
        childutils.listener.ready()

        # Each notification is a header line followed by a payload whose
        # size in bytes is given by the 'len' header.
        line = sys.stdin.readline()
        headers = childutils.get_headers(line)
        payload = sys.stdin.read(int(headers['len']))

        # Transition from READY to ACKNOWLEDGED
        childutils.listener.ok()

        # We only care about PROCESS_STATE_EXITED events
        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            continue

        # NOTE(review): a newline is appended because eventdata() appears to
        # expect a header line terminated by '\n' ahead of any body; confirm
        # against the installed supervisor version.
        payload_headers, payload_data = childutils.eventdata(payload + '\n')

        expected = int(payload_headers['expected'])
        processname = payload_headers['processname']

        # If a critical process exited unexpectedly, terminate supervisor
        if expected == 0 and processname in critical_processes:
            MSG_FORMAT_STR = "Process {} exited unexpectedly. Terminating supervisor..."
            msg = MSG_FORMAT_STR.format(processname)
            syslog.syslog(syslog.LOG_INFO, msg)
            # The listener is a child of supervisor, so the parent PID is
            # the supervisor process itself.
            os.kill(os.getppid(), signal.SIGTERM)

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion platform/broadcom/docker-orchagent-brcm.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_BRCM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/cavium/docker-orchagent-cavm.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_CAVM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/centec/docker-orchagent-centec.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_CENTEC)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/marvell/docker-orchagent-mrvl.mk
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /host/machine.conf:/host/machine.conf
$(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro

$(DOCKER_ORCHAGENT_MRVL)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/mellanox/docker-orchagent-mlnx.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_MLNX)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion platform/nephos/docker-orchagent-nephos.mk
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT_NEPHOS)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
2 changes: 1 addition & 1 deletion rules/docker-dhcp-relay.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

DOCKER_DHCP_RELAY = docker-dhcp-relay.gz
$(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/docker-dhcp-relay
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT)
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT) $(REDIS_TOOLS)
$(DOCKER_DHCP_RELAY)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE)
SONIC_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
Expand Down
6 changes: 5 additions & 1 deletion rules/scripts.mk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ $(ARP_UPDATE_SCRIPT)_PATH = files/scripts
CONFIGDB_LOAD_SCRIPT = configdb-load.sh
$(CONFIGDB_LOAD_SCRIPT)_PATH = files/scripts

SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts

SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
$(ARP_UPDATE_SCRIPT)
$(ARP_UPDATE_SCRIPT) \
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)


50 changes: 24 additions & 26 deletions src/sonic-config-engine/tests/sample_output/wait_for_intf.sh
Original file line number Diff line number Diff line change
@@ -1,43 +1,41 @@
#!/usr/bin/env bash

function wait_until_iface_ready
{
IFACE=$1
STATE_DB_IDX="6"

echo "Waiting until interface $IFACE is up..."

# Wait for the interface to come up (i.e., 'ip link show' returns 0)
until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"

echo "Interface $IFACE is up"
function wait_until_iface_ready
{
TABLE_PREFIX=$1
IFACE=$2

echo "Waiting until interface $IFACE has an IPv4 address..."
echo "Waiting until interface $IFACE is ready..."

# Wait until the interface gets assigned an IPv4 address
# Wait for the interface to come up
# (i.e., interface is present in STATE_DB and state is "ok")
while true; do
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)

if [ -n "$IP" ]; then
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
if [ x"$RESULT" == x"ok" ]; then
break
fi

sleep 1
done

echo "Interface $IFACE is configured with IP $IP"
echo "Interface ${IFACE} is ready!"
}


# Wait for all interfaces to come up and have IPv4 addresses assigned
wait_until_iface_ready Vlan1000
wait_until_iface_ready PortChannel04
wait_until_iface_ready PortChannel02
wait_until_iface_ready PortChannel03
wait_until_iface_ready PortChannel03
wait_until_iface_ready PortChannel01
wait_until_iface_ready PortChannel02
wait_until_iface_ready PortChannel04
wait_until_iface_ready PortChannel01
# Wait for all interfaces to be up and ready
wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01

0 comments on commit 2a8af27

Please sign in to comment.