Skip to content

Commit ccb607a

Browse files
jleveque and zhenggen-xu
authored and committed
[docker-syncd]: Restart SwSS, syncd and dependent services if a critical process in syncd container exits unexpectedly (sonic-net#3534)
Add the same mechanism I developed for the SwSS service in sonic-net#2845 to the syncd service. However, in order to cause the SwSS service to also exit and restart in this situation, I developed a docker-wait-any program which the SwSS service uses to wait for either the swss or syncd containers to exit.
1 parent 1534785 commit ccb607a

38 files changed

+164
-8
lines changed

files/build_templates/sonic_debian_extension.j2

+3
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
218218
echo "hostname-config.service" | sudo tee -a $GENERATED_SERVICE_FILE
219219
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/
220220

221+
# Copy miscellaneous scripts
222+
sudo cp $IMAGE_CONFIGS/misc/docker-wait-any $FILESYSTEM_ROOT/usr/bin/
223+
221224
# Copy updategraph script and service file
222225
j2 files/build_templates/updategraph.service.j2 | sudo tee $FILESYSTEM_ROOT/etc/systemd/system/updategraph.service
223226
sudo cp $IMAGE_CONFIGS/updategraph/updategraph $FILESYSTEM_ROOT/usr/bin/
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
docker-wait-any
5+
This script takes one or more Docker container names as arguments,
6+
and it will block indefinitely while all of the specified containers
7+
are running. If any of the specified containers stop, the script will
8+
exit.
9+
This script was created because the 'docker wait' command is lacking
10+
this functionality: 'docker wait' blocks until ALL specified containers have
11+
stopped running. Here, we spawn multiple threads and wait on one
12+
container per thread. If any of the threads exit, the entire
13+
application will exit.
14+
NOTE: This script is written against docker-py version 1.6.0. Newer
15+
versions of docker-py have a different API.
16+
"""
17+
18+
import sys
19+
import threading
20+
from docker import Client
21+
22+
# Instantiate a global event to share among our threads
23+
g_thread_exit_event = threading.Event()
24+
25+
26+
def usage():
    """Write a usage message to stderr and terminate with exit status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    # A usage error is a diagnostic, so it goes to stderr rather than stdout.
    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # print(..., file=...), which needs a __future__ import on Python 2.
    sys.stderr.write(
        "Usage: {} <container_name> [<container_name> ...]\n".format(sys.argv[0])
    )
    sys.exit(1)
29+
30+
31+
def wait_for_container(docker_client, container_name):
    """Wait on one container, then signal the main thread to exit.

    Intended to run as the target of a worker thread, one per container.

    Args:
        docker_client: object exposing ``wait(container_name)``; in this
            script, the shared docker-py ``Client`` instance.
        container_name: name of the Docker container to wait on.

    The module-level ``g_thread_exit_event`` is always set before this
    function returns, including when the wait call itself fails.
    """
    try:
        docker_client.wait(container_name)
    except Exception as err:
        # This is the top of a worker thread: an escaping exception would
        # kill the thread silently and leave the main thread blocked on the
        # event forever. Report the failure and fall through to the signal.
        sys.stderr.write(
            "Error while waiting on container '{}': {}\n".format(container_name, err)
        )
    else:
        print("No longer waiting on container '{}'".format(container_name))
    finally:
        # Signal the main thread to exit
        g_thread_exit_event.set()
38+
39+
40+
def main():
    """Block until any container named on the command line stops, then exit 0.

    Each command-line argument is taken as a container name. One daemon
    thread per container runs ``wait_for_container``; the main thread blocks
    on ``g_thread_exit_event`` and exits as soon as any worker sets it.

    Raises:
        SystemExit: with code 1 (via ``usage``) when no container name is
            given, otherwise with code 0 once the event has been set.
    """
    # Ensure we were passed at least one argument before creating the client
    if len(sys.argv) < 2:
        usage()

    container_names = sys.argv[1:]

    docker_client = Client(base_url='unix://var/run/docker.sock')

    for container_name in container_names:
        waiter = threading.Thread(target=wait_for_container,
                                  args=(docker_client, container_name))
        # Daemon threads do not keep the process alive, so the remaining
        # waiters cannot block the exit below.
        waiter.daemon = True
        waiter.start()

    # Wait until we receive an event signifying one of the containers has stopped
    g_thread_exit_event.wait()
    sys.exit(0)


if __name__ == '__main__':
    main()

files/scripts/swss.sh

+16-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,22 @@ start() {
131131

132132
wait() {
133133
start_peer_and_dependent_services
134-
/usr/bin/${SERVICE}.sh wait
134+
135+
# Allow some time for peer container to start
136+
# NOTE: This assumes Docker containers share the same names as their
137+
# corresponding services
138+
for SECS in {1..60}; do
139+
RUNNING=$(docker inspect -f '{{.State.Running}}' ${PEER})
140+
if [[ x"$RUNNING" == x"true" ]]; then
141+
break
142+
else
143+
sleep 1
144+
fi
145+
done
146+
147+
# NOTE: This assumes Docker containers share the same names as their
148+
# corresponding services
149+
/usr/bin/docker-wait-any ${SERVICE} ${PEER}
135150
}
136151

137152
stop() {

platform/barefoot/docker-syncd-bfn-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_BFN_RPC = docker-syncd-bfn-rpc.gz
44
$(DOCKER_SYNCD_BFN_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-bfn-rpc
55
$(DOCKER_SYNCD_BFN_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
6+
$(DOCKER_SYNCD_BFN_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_BFN_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/barefoot/docker-syncd-bfn/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ debs/{{ deb }}{{' '}}
2929

3030
COPY ["start.sh", "/usr/bin/"]
3131
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
32+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
33+
COPY ["critical_processes", "/etc/supervisor/"]
3234

3335
## Clean up
3436
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/broadcom/docker-syncd-brcm-rpc.mk

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ $(DOCKER_SYNCD_BRCM_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
99
$(LIBSAIMETADATA_DBG) \
1010
$(LIBSAIREDIS_DBG)
1111
endif
12-
$(DOCKER_SYNCD_BRCM_RPC)_FILES += $(DSSERVE) $(BCMCMD)
12+
$(DOCKER_SYNCD_BRCM_RPC)_FILES += $(DSSERVE) $(BCMCMD) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
1313
$(DOCKER_SYNCD_BRCM_RPC)_LOAD_DOCKERS += $(DOCKER_SYNCD_BASE)
1414
SONIC_DOCKER_IMAGES += $(DOCKER_SYNCD_BRCM_RPC)
1515
SONIC_STRETCH_DOCKERS += $(DOCKER_SYNCD_BRCM_RPC)

platform/broadcom/docker-syncd-brcm/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ COPY ["files/dsserve", "files/bcmcmd", "start.sh", "bcmsh", "/usr/bin/"]
2626
RUN chmod +x /usr/bin/dsserve /usr/bin/bcmcmd
2727

2828
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
29+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
30+
COPY ["critical_processes", "/etc/supervisor/"]
2931

3032
## Clean up
3133
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
dsserve
2+
syncd

platform/broadcom/docker-syncd-brcm/supervisord.conf

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
33
logfile_backups=2
44
nodaemon=true
55

6+
[eventlistener:supervisor-proc-exit-listener]
7+
command=/usr/bin/supervisor-proc-exit-listener
8+
events=PROCESS_STATE_EXITED
9+
autostart=true
10+
autorestart=unexpected
11+
612
[program:start.sh]
713
command=/usr/bin/start.sh
814
priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
1521
command=/usr/sbin/rsyslogd -n
1622
priority=2
1723
autostart=false
18-
autorestart=false
24+
autorestart=unexpected
1925
stdout_logfile=syslog
2026
stderr_logfile=syslog
2127

platform/cavium/docker-syncd-cavm-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_CAVM_RPC = docker-syncd-cavm-rpc.gz
44
$(DOCKER_SYNCD_CAVM_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-cavm-rpc
55
$(DOCKER_SYNCD_CAVM_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT) $(CAVM_LIBSAI) $(XP_TOOLS) $(REDIS_TOOLS)
6+
$(DOCKER_SYNCD_CAVM_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_CAVM_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/cavium/docker-syncd-cavm.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_CAVM = docker-syncd-cavm.gz
44
$(DOCKER_SYNCD_CAVM)_PATH = $(PLATFORM_PATH)/docker-syncd-cavm
55
$(DOCKER_SYNCD_CAVM)_DEPENDS += $(SYNCD) $(CAVM_LIBSAI) $(XP_TOOLS) $(REDIS_TOOLS)
6+
$(DOCKER_SYNCD_CAVM)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_CAVM)_DEPENDS += $(SYNCD_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/cavium/docker-syncd-cavm/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ debs/{{ deb }}{{' '}}
2323

2424
COPY ["start.sh", "/usr/bin/"]
2525
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
26+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
27+
COPY ["critical_processes", "/etc/supervisor/"]
2628

2729
COPY ["profile.ini", "/etc/ssw/AS7512/"]
2830

Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/cavium/docker-syncd-cavm/supervisord.conf

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
33
logfile_backups=2
44
nodaemon=true
55

6+
[eventlistener:supervisor-proc-exit-listener]
7+
command=/usr/bin/supervisor-proc-exit-listener
8+
events=PROCESS_STATE_EXITED
9+
autostart=true
10+
autorestart=unexpected
11+
612
[program:start.sh]
713
command=/usr/bin/start.sh
814
priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
1521
command=/usr/sbin/rsyslogd -n
1622
priority=2
1723
autostart=false
18-
autorestart=false
24+
autorestart=unexpected
1925
stdout_logfile=syslog
2026
stderr_logfile=syslog
2127

platform/centec/docker-syncd-centec-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_CENTEC_RPC = docker-syncd-centec-rpc.gz
44
$(DOCKER_SYNCD_CENTEC_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-centec-rpc
55
$(DOCKER_SYNCD_CENTEC_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
6+
$(DOCKER_SYNCD_CENTEC_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_CENTEC_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/centec/docker-syncd-centec.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_CENTEC = docker-syncd-centec.gz
44
$(DOCKER_SYNCD_CENTEC)_PATH = $(PLATFORM_PATH)/docker-syncd-centec
55
$(DOCKER_SYNCD_CENTEC)_DEPENDS += $(SYNCD)
6+
$(DOCKER_SYNCD_CENTEC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_CENTEC)_DEPENDS += $(SYNCD_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/centec/docker-syncd-centec/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ RUN apt-get install -f kmod
2424

2525
COPY ["start.sh", "/usr/bin/"]
2626
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
27+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
28+
COPY ["critical_processes", "/etc/supervisor/"]
2729

2830
## Clean up
2931
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/centec/docker-syncd-centec/supervisord.conf

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
33
logfile_backups=2
44
nodaemon=true
55

6+
[eventlistener:supervisor-proc-exit-listener]
7+
command=/usr/bin/supervisor-proc-exit-listener
8+
events=PROCESS_STATE_EXITED
9+
autostart=true
10+
autorestart=unexpected
11+
612
[program:start.sh]
713
command=/usr/bin/start.sh
814
priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
1521
command=/usr/sbin/rsyslogd -n
1622
priority=2
1723
autostart=false
18-
autorestart=false
24+
autorestart=unexpected
1925
stdout_logfile=syslog
2026
stderr_logfile=syslog
2127

platform/marvell-arm64/docker-syncd-mrvl-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_MRVL_RPC = docker-syncd-mrvl-rpc.gz
44
$(DOCKER_SYNCD_MRVL_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mrvl-rpc
55
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
6+
$(DOCKER_SYNCD_MRVL_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/marvell-arm64/docker-syncd-mrvl/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ debs/{{ deb }}{{' '}}
2828

2929
COPY ["start.sh", "/usr/bin/"]
3030
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
31+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
32+
COPY ["critical_processes", "/etc/supervisor/"]
3133

3234
## Clean up
3335
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/marvell-armhf/docker-syncd-mrvl-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_MRVL_RPC = docker-syncd-mrvl-rpc.gz
44
$(DOCKER_SYNCD_MRVL_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mrvl-rpc
55
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
6+
$(DOCKER_SYNCD_MRVL_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/marvell-armhf/docker-syncd-mrvl/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ debs/{{ deb }}{{' '}}
2828

2929
COPY ["start.sh", "/usr/bin/"]
3030
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
31+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
32+
COPY ["critical_processes", "/etc/supervisor/"]
3133

3234
## Clean up
3335
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/marvell/docker-syncd-mrvl-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_MRVL_RPC = docker-syncd-mrvl-rpc.gz
44
$(DOCKER_SYNCD_MRVL_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mrvl-rpc
55
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
6+
$(DOCKER_SYNCD_MRVL_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_MRVL_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/marvell/docker-syncd-mrvl/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ debs/{{ deb }}{{' '}}
2323

2424
COPY ["start.sh", "syncd.sh", "/usr/bin/"]
2525
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
26+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
27+
COPY ["critical_processes", "/etc/supervisor/"]
2628

2729
## Clean up
2830
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/marvell/docker-syncd-mrvl/supervisord.conf

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
33
logfile_backups=2
44
nodaemon=true
55

6+
[eventlistener:supervisor-proc-exit-listener]
7+
command=/usr/bin/supervisor-proc-exit-listener
8+
events=PROCESS_STATE_EXITED
9+
autostart=true
10+
autorestart=unexpected
11+
612
[program:start.sh]
713
command=/usr/bin/start.sh
814
priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
1521
command=/usr/sbin/rsyslogd -n
1622
priority=2
1723
autostart=false
18-
autorestart=false
24+
autorestart=unexpected
1925
stdout_logfile=syslog
2026
stderr_logfile=syslog
2127

platform/mellanox/docker-syncd-mlnx-rpc.mk

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
DOCKER_SYNCD_MLNX_RPC = docker-syncd-mlnx-rpc.gz
44
$(DOCKER_SYNCD_MLNX_RPC)_PATH = $(PLATFORM_PATH)/docker-syncd-mlnx-rpc
55
$(DOCKER_SYNCD_MLNX_RPC)_DEPENDS += $(SYNCD_RPC) $(LIBTHRIFT)
6+
$(DOCKER_SYNCD_MLNX_RPC)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
67
ifeq ($(INSTALL_DEBUG_TOOLS), y)
78
$(DOCKER_SYNCD_MLNX_RPC)_DEPENDS += $(SYNCD_RPC_DBG) \
89
$(LIBSWSSCOMMON_DBG) \

platform/mellanox/docker-syncd-mlnx/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,7 @@ RUN apt-get clean -y && \
3535

3636
COPY ["start.sh", "/usr/bin/"]
3737
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
38+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
39+
COPY ["critical_processes", "/etc/supervisor/"]
3840

3941
ENTRYPOINT ["/usr/bin/supervisord"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syncd

platform/mellanox/docker-syncd-mlnx/supervisord.conf

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
33
logfile_backups=2
44
nodaemon=true
55

6+
[eventlistener:supervisor-proc-exit-listener]
7+
command=/usr/bin/supervisor-proc-exit-listener
8+
events=PROCESS_STATE_EXITED
9+
autostart=true
10+
autorestart=unexpected
11+
612
[program:start.sh]
713
command=/usr/bin/start.sh
814
priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
1521
command=/usr/sbin/rsyslogd -n
1622
priority=2
1723
autostart=false
18-
autorestart=false
24+
autorestart=unexpected
1925
stdout_logfile=syslog
2026
stderr_logfile=syslog
2127

platform/nephos/docker-syncd-nephos/Dockerfile.j2

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ COPY ["files/dsserve", "files/npx_diag", "start.sh", "/usr/bin/"]
3636
RUN chmod +x /usr/bin/npx_diag /usr/bin/dsserve
3737

3838
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
39+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
40+
COPY ["critical_processes", "/etc/supervisor/"]
3941

4042
## Clean up
4143
RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
dsserve
2+
syncd

0 commit comments

Comments
 (0)