From 4472ace1205dc9c15f59c31ee63db99723e9ba39 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 23 Jan 2019 16:58:14 -0800 Subject: [PATCH] ci-operator/templates/openshift/installer: Restore backgrounded 'create cluster' Unwinding part of 7e56379d (templates/openshift: grab bootstrap log on failure, 2019-01-14, #2633). I'd initially added the backgrounding in d862f6f1 (cluster-launch-installer-e2e: Restore EXIT and TERM signal handlers, 2018-10-17, #1957). If you exec 'create cluster', it kills the traps, but I was less clear on whether the background/wait dance was required vs. the approach taken by 7e56379d. Here are scripts comparing the two approaches vs. SIGTERM (e.g. if a test run is aborted because new updates have been pushed to a pull request): $ cat test-sleep.sh #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then echo touch /tmp/setup-success; else echo touch /tmp/exit; fi; exit "${rc}"' EXIT trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then echo kill ${CHILDREN} && kill ${CHILDREN}; fi' TERM echo $$ python -c 'import time; time.sleep(20); print("done sleeping")' $ ./test-sleep.sh # while this is running, in another shell: kill 17125 17125 done sleeping touch /tmp/setup-success That's not good, the Python process should have been killed before it got far enough in to print "done sleeping". Here's with a background/wait: $ cat test-sleep-wait.sh #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then echo touch /tmp/setup-success; else echo touch /tmp/exit; fi; exit "${rc}"' EXIT trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then echo kill ${CHILDREN} && kill ${CHILDREN}; fi' TERM echo $$ python -c 'import time; time.sleep(20); print("done sleeping")' & wait "$!" $ ./test-sleep-wait.sh # kill 17242 17242 kill 17243 touch /tmp/exit that looks better. Stracing to put a finger on the differences (e.g. 'strace -tfo /tmp/trace ./test-sleep.sh'), gives: 21243 21:59:18 execve("./test-sleep.sh", ["./test-sleep.sh"], [/* 74 vars */]) = 0 ... 21243 21:59:18 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fe60e216a10) = 21244 ... 21244 21:59:18 execve("/usr/bin/python", ["python", "-c", "import time; time.sleep(20); pri"...], [/* 73 vars */]) = 0 ... 21244 21:59:18 select(0, NULL, NULL, NULL, {20, 0} 21243 21:59:25 <... wait4 resumed> 0x7ffe9feef350, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) 21243 21:59:25 --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=8603, si_uid=114032} --- ... 21244 21:59:38 write(1, "done sleeping\n", 14) = 14 21244 21:59:38 rt_sigaction(SIGINT, {SIG_DFL, [], SA_RESTORER, 0x7f20f7b88680}, {0x7f20f7eab680, [], SA_RESTORER, 0x7f20f7b88680}, 8) = 0 21244 21:59:38 exit_group(0) = ? 21244 21:59:38 +++ exited with 0 +++ 21243 21:59:38 <... wait4 resumed> [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 21244 ... 21243 21:59:38 pipe([3, 4]) = 0 ... 21243 21:59:38 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fe60e216a10) = 21262 21243 21:59:38 rt_sigprocmask(SIG_SETMASK, [TERM], NULL, 8) = 0 21243 21:59:38 rt_sigaction(SIGCHLD, {0x441390, [], SA_RESTORER|SA_RESTART, 0x7fe60d850280}, 21262 21:59:38 close(255 ... 21243 21:59:38 read(3, ... 21262 21:59:38 close(4) = 0 ... 21262 21:59:38 exit_group(0) = ? 21243 21:59:38 <... read resumed> "", 128) = 0 21262 21:59:38 +++ exited with 0 +++ 21243 21:59:38 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=21262, si_uid=114032, si_status=0, si_utime=0, si_stime=0} --- 21243 21:59:38 wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 21262 ... 21243 21:59:38 write(1, "touch /tmp/setup-success\n", 25) = 25 21243 21:59:38 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 21243 21:59:38 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 21243 21:59:38 exit_group(0) = ? 21243 21:59:38 +++ exited with 0 +++ so it looks like the shell (21243) just swallows the TERM, waits out the sleeping Python process, and exits zero. 21262 is the 'jobs' process, which doesn't fire until Python finishes its sleep and returns. The trap is a no-op, because test-sleep.sh never spawns backgrounded children, so 'jobs' returns an empty string. And here's an strace on test-sleep-wait.sh: 21638 22:04:48 execve("./test-sleep-wait.sh", ["./test-sleep-wait.sh"], [/* 74 vars */]) = 0 ... 21638 22:04:48 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f78dc7c4a10) = 21639 ... 21639 22:04:48 execve("/usr/bin/python", ["python", "-c", "import time; time.sleep(20); pri"...], [/* 73 vars */]) = 0 ... 21639 22:04:48 select(0, NULL, NULL, NULL, {20, 0} 21638 22:04:54 <... wait4 resumed> 0x7ffc07d607a0, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) 21638 22:04:54 --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=8603, si_uid=114032} --- ... 21638 22:04:54 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f78dc7c4a10) = 21645 ... 21645 22:04:54 write(1, "21639\n", 6) = 6 ... 21645 22:04:54 exit_group(0) = ? 21645 22:04:54 +++ exited with 0 +++ 21638 22:04:54 <... read resumed> "", 128) = 0 21638 22:04:54 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=21645, si_uid=114032, si_status=0, si_utime=0, si_stime=0} --- 21638 22:04:54 rt_sigreturn({mask=[TERM]}) = 0 ... 21638 22:04:54 wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 21645 ... 21638 22:04:54 write(1, "kill 21639\n", 11) = 11 21638 22:04:54 kill(21639, SIGTERM) = 0 ... 21638 22:04:54 write(1, "touch /tmp/exit\n", 16 21639 22:04:54 +++ killed by SIGTERM +++ 21638 22:04:54 <... write resumed> ) = 16 21638 22:04:54 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=21639, si_uid=114032, si_status=SIGTERM, si_utime=0, si_stime=0} --- 21638 22:04:54 wait4(-1, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGTERM}], WNOHANG, NULL) = 21639 21638 22:04:54 wait4(-1, 0x7ffc07d60010, WNOHANG, NULL) = -1 ECHILD (No child processes) 21638 22:04:54 rt_sigreturn({mask=[]}) = 16 21638 22:04:54 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 21638 22:04:54 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 21638 22:04:54 exit_group(143) = ? 21638 22:04:54 +++ exited with 143 +++ With this approach, we see the shell (21638) catch the TERM and immediately launch the trap handler which launches 'jobs' (21645). The trap handler sends a new TERM to the Python process (21639), which exits non-zero. The shell reaps the 'jobs' process, echos "touch /tmp/exit", reaps the killed Python process, and exits 143. From the ordering there it looks like the shell isn't actually waiting for the Python process, although we could add a wait to the TERM handler if we wanted. Let's do that, and also drop some of the debugging echos now that we're committed to strace: $ cat test-sleep-wait-in-handler.sh #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/success; else touch /tmp/exit; fi; exit "${rc}"' EXIT trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM echo $$ python -c 'import time; time.sleep(20); print("done sleeping")' & wait "$!" $ strace -tfo /tmp/trace ./test-sleep-wait-in-handler.sh 23095 As above, I then 'kill 23095' from another terminal. Excerpts from the trace: 23095 22:27:03 execve("./test-sleep-wait-in-handler.sh", ["./test-sleep-wait-in-handler.sh"], [/* 74 vars */]) = 0 ... 23095 22:27:03 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f8e464f2a10) = 23096 ... 23096 22:27:03 execve("/usr/bin/python", ["python", "-c", "import time; time.sleep(20); pri"...], [/* 73 vars */]) = 0 ... 23096 22:27:03 select(0, NULL, NULL, NULL, {20, 0} 23095 22:27:09 <... wait4 resumed> 0x7ffe54e18750, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) 23095 22:27:09 --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=8603, si_uid=114032} --- ... 23095 22:27:09 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f8e464f2a10) = 23110 ... 23110 22:27:09 write(1, "23096\n", 6) = 6 23095 22:27:09 <... read resumed> "23096\n", 128) = 6 ... 23110 22:27:09 +++ exited with 0 +++ 23095 22:27:09 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=23110, si_uid=114032, si_status=0, si_utime=0, si_stime=0} --- ... 23095 22:27:09 wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 23110 ... 23095 22:27:09 kill(23096, SIGTERM) = 0 23096 22:27:09 <... select resumed> ) = ? ERESTARTNOHAND (To be restarted if no handler) 23095 22:27:09 rt_sigprocmask(SIG_BLOCK, NULL, 23096 22:27:09 --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=23095, si_uid=114032} --- ... 23095 22:27:09 wait4(-1, 23096 22:27:09 +++ killed by SIGTERM +++ 23095 22:27:09 <... wait4 resumed> [{WIFSIGNALED(s) && WTERMSIG(s) == SIGTERM}], 0, NULL) = 23096 ... 23095 22:27:09 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=23096, si_uid=114032, si_status=SIGTERM, si_utime=1, si_stime=0} --- 23095 22:27:09 wait4(-1, 0x7ffe54e17810, WNOHANG, NULL) = -1 ECHILD (No child processes) ... 23095 22:27:09 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f8e464f2a10) = 23111 ... 23111 22:27:09 execve("/usr/bin/touch", ["touch", "/tmp/exit"], [/* 73 vars */]) = 0 ... 23111 22:27:09 open("/tmp/exit", O_WRONLY|O_CREAT|O_NOCTTY|O_NONBLOCK, 0666) = 3 ... 23111 22:27:09 exit_group(0) = ? 23111 22:27:09 +++ exited with 0 +++ 23095 22:27:09 <... wait4 resumed> [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 23111 23095 22:27:09 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 23095 22:27:09 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=23111, si_uid=114032, si_status=0, si_utime=0, si_stime=0} --- 23095 22:27:09 wait4(-1, 0x7ffe54e18050, WNOHANG, NULL) = -1 ECHILD (No child processes) ... 23095 22:27:09 exit_group(143) = ? 23095 22:27:09 +++ exited with 143 +++ With this approach, we see the shell (23095) catch the TERM and immediately launch the trap handler which launches 'jobs' (23110) and reaps it. The trap handler sends a new TERM to the Python process (23096), which exits non-zero. The shell reaps the killed Python process, waits but gets no children, launches and reaps the touch process (23111), and exits 143. That looks like what we want, so I've gone that way with this commit. --- .../openshift/installer/cluster-launch-installer-e2e.yaml | 5 +++-- .../installer/cluster-launch-installer-libvirt-e2e.yaml | 2 +- .../openshift/installer/cluster-launch-installer-src.yaml | 5 +++-- .../openshift/openshift-ansible/cluster-launch-e2e-40.yaml | 2 +- .../openshift/openshift-ansible/cluster-scaleup-e2e-40.yaml | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml index e3c50edc60a0..059dc74b3b91 100644 --- a/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml +++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml @@ -299,7 +299,7 @@ objects: - | #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/setup-success; else touch /tmp/exit; fi; exit "${rc}"' EXIT - trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN}; fi' TERM + trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM mkdir /tmp/artifacts/installer && /bin/openshift-install version >/tmp/artifacts/installer/version @@ -373,7 +373,8 @@ objects: exit 1 fi - /bin/openshift-install --dir=/tmp/artifacts/installer create cluster + /bin/openshift-install --dir=/tmp/artifacts/installer create cluster & + wait "$!" # Performs cleanup of all created resources - name: teardown diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-libvirt-e2e.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-libvirt-e2e.yaml index 4f71adb4138f..f425a6dee61e 100644 --- a/ci-operator/templates/openshift/installer/cluster-launch-installer-libvirt-e2e.yaml +++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-libvirt-e2e.yaml @@ -173,7 +173,7 @@ objects: #!/bin/sh set -euo pipefail trap 'rc=$?; if test "${rc}" -eq 0; then touch "${HOME}"/setup-success; else touch "${HOME}"/exit; fi; exit "${rc}"' EXIT - trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN}; fi' TERM + trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM cat > "${HOME}"/router-check.sh << 'EOF' #!/bin/bash set -euo pipefail diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-src.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-src.yaml index 38399c315bec..a2a87c678e1a 100644 --- a/ci-operator/templates/openshift/installer/cluster-launch-installer-src.yaml +++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-src.yaml @@ -264,7 +264,7 @@ objects: - | #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/setup-success; else touch /tmp/exit; fi; exit "${rc}"' EXIT - trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN}; fi' TERM + trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM mkdir /tmp/artifacts/installer && /bin/openshift-install version >/tmp/artifacts/installer/version @@ -338,7 +338,8 @@ objects: exit 1 fi - /bin/openshift-install --dir=/tmp/artifacts/installer create cluster + /bin/openshift-install --dir=/tmp/artifacts/installer create cluster & + wait "$!" # Performs cleanup of all created resources - name: teardown diff --git a/ci-operator/templates/openshift/openshift-ansible/cluster-launch-e2e-40.yaml b/ci-operator/templates/openshift/openshift-ansible/cluster-launch-e2e-40.yaml index 853a6e78bd2e..4518282922ff 100644 --- a/ci-operator/templates/openshift/openshift-ansible/cluster-launch-e2e-40.yaml +++ b/ci-operator/templates/openshift/openshift-ansible/cluster-launch-e2e-40.yaml @@ -262,7 +262,7 @@ objects: - | #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/config-success; else touch /tmp/exit; fi; exit "${rc}"' EXIT - trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN}; fi' TERM + trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM mkdir /tmp/artifacts/installer && /bin/openshift-install version >/tmp/artifacts/installer/version diff --git a/ci-operator/templates/openshift/openshift-ansible/cluster-scaleup-e2e-40.yaml b/ci-operator/templates/openshift/openshift-ansible/cluster-scaleup-e2e-40.yaml index ad58d07eb2c3..abbf17fbfa27 100644 --- a/ci-operator/templates/openshift/openshift-ansible/cluster-scaleup-e2e-40.yaml +++ b/ci-operator/templates/openshift/openshift-ansible/cluster-scaleup-e2e-40.yaml @@ -303,7 +303,7 @@ objects: - | #!/bin/sh trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/setup-success; else touch /tmp/exit; fi; exit "${rc}"' EXIT - trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN}; fi' TERM + trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM mkdir /tmp/artifacts/installer && /bin/openshift-install version >/tmp/artifacts/installer/version