From f6f872a8fcee04890383065d3683d6251b35e160 Mon Sep 17 00:00:00 2001 From: Jeremy Peterson Date: Wed, 8 Mar 2023 23:41:21 -0500 Subject: [PATCH] Support Graceful Shutdown If the kubelet where the sriov pod is running has gracefulShutdown configured, we'll delay in preStop for a while if and while /tmp/sriov-delay-shutdown exists, up to a maximum wait of 10 minutes (less than the 15 minutes which is specified in the daemonset pod's terminationGracePeriodSeconds). --- bindata/manifests/daemon/daemonset.yaml | 1 + bindata/scripts/clean-k8s-services.sh | 28 ++++++++++++++-- pkg/daemon/daemon.go | 44 ++++++++++++++++++++----- 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/bindata/manifests/daemon/daemonset.yaml b/bindata/manifests/daemon/daemonset.yaml index 2a5d5bdaf..b10879fa2 100644 --- a/bindata/manifests/daemon/daemonset.yaml +++ b/bindata/manifests/daemon/daemonset.yaml @@ -93,6 +93,7 @@ spec: volumeMounts: - name: cnibin mountPath: /host/opt/cni/bin + terminationGracePeriodSeconds: 900 volumes: - name: host hostPath: diff --git a/bindata/scripts/clean-k8s-services.sh b/bindata/scripts/clean-k8s-services.sh index 59cbc5fd7..737a5d256 100755 --- a/bindata/scripts/clean-k8s-services.sh +++ b/bindata/scripts/clean-k8s-services.sh @@ -1,12 +1,36 @@ #!/bin/bash +chroot_path="/proc/1/root" +delay_shutdown_path="$chroot_path/tmp/sriov-delay-shutdown" +kubelet_config_path="$chroot_path/etc/kubernetes/kubelet.conf" + +# 10 minutes - this should be shorter than the time that is specified for the +# terminationGracePeriodSeconds in the daemonset's pod spec, so that everything +# else in the preStop hook has time to run and the Pod can be terminated properly. +wait_time=600 + +# If the kubelet is configured to shut down gracefully (>0s shutdownGracePeriod), we need to wait for +# things to settle before shutting down the node.
+if [ -f "$delay_shutdown_path" ]; then + if grep "$kubelet_config_path" -e shutdownGracePeriod | grep -qv \"0s\"; then + start=$(date +%s) + touch "$chroot_path/var/log/sriov-delay-start" + while [ $(( $(date +%s) - $start )) -lt $wait_time ]; do + if [ ! -f "$delay_shutdown_path" ]; then # don't have to wait anymore + break + fi + sleep 1 + done + rm -f "$delay_shutdown_path" + touch "$chroot_path/var/log/sriov-delay-end" + fi +fi + if [ "$CLUSTER_TYPE" == "openshift" ]; then echo "openshift cluster" exit fi -chroot_path="/host" - function clean_services() { # Remove switchdev service files rm -f $chroot_path/etc/systemd/system/switchdev-configuration-after-nm.service diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 727ffe642..c6459f43e 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -52,6 +52,9 @@ const ( // maxUpdateBackoff is the maximum time to react to a change as we back off // in the face of errors. maxUpdateBackoff = 60 * time.Second + + // the presence of this file indicates that the sriov shutdown should be delayed + delayShutdownPath = "/host/tmp/sriov-delay-shutdown" ) type Message struct { @@ -612,6 +615,22 @@ func (dn *Daemon) completeDrain() error { glog.Errorf("completeDrain(): failed to annotate node: %v", err) return err } + + if _, err := os.Stat(delayShutdownPath); err != nil { + if os.IsNotExist(err) { + // delayShutdownPath does not exist, so we don't need to do anything + return nil + } + + glog.Errorf("completeDrain(): error checking file status %v: %v", delayShutdownPath, err) + return err + } + + if err := os.Remove(delayShutdownPath); err != nil { + glog.Errorf("completeDrain(): failed to remove file %v: %v", delayShutdownPath, err) + return err + } + return nil } @@ -679,15 +698,16 @@ func rebootNode() { glog.Errorf("rebootNode(): %v", err) } defer exit() - // creates a new transient systemd unit to reboot the system. 
- // We explictily try to stop kubelet.service first, before anything else; this - // way we ensure the rest of system stays running, because kubelet may need - // to do "graceful" shutdown by e.g. de-registering with a load balancer. - // However note we use `;` instead of `&&` so we keep rebooting even - // if kubelet failed to shutdown - that way the machine will still eventually reboot - // as systemd will time out the stop invocation. + // creates a new transient systemd unit that reboots the system + // using `systemctl reboot`. + // By shutting down the system this way instead of via `reboot`, + // a kubelet that is configured with a shutdownGracePeriod will + // give pods some time to run their preStop scripts and respond to + // SIGTERM by terminating gracefully before being forcefully killed via + // SIGKILL. Stopping the kubelet service and then immediately running + // `reboot` just results in all pods being immediately killed. cmd := exec.Command("systemd-run", "--unit", "sriov-network-config-daemon-reboot", - "--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl stop kubelet.service; reboot") + "--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl reboot") if err := cmd.Run(); err != nil { glog.Errorf("failed to reboot node: %v", err) @@ -933,6 +953,14 @@ func (dn *Daemon) drainNode() error { return err } glog.Info("drainNode(): drain complete") + + file, err := os.Create(delayShutdownPath) + if err != nil { + glog.Errorf("drainNode(): failed to create file %v %v", delayShutdownPath, err) + return err + } + defer file.Close() + return nil }