From a760940d3252ccb845b55b86c69fd83c75fa5c66 Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Mon, 24 Jun 2024 14:25:52 +0530 Subject: [PATCH 1/6] improved drain timeout calculation --- pkg/util/provider/drain/drain.go | 19 ++++++++++--------- .../machinecontroller/machine_util.go | 9 ++++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index 34da4569e..975ac1759 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -653,16 +653,12 @@ func (o *Options) evictPodsWithPVInternal( returnCh chan error, ) (remainingPods []*corev1.Pod, fastTrack bool) { var ( - mainContext context.Context - cancelMainContext context.CancelFunc - retryPods []*corev1.Pod + retryPods []*corev1.Pod ) - mainContext, cancelMainContext = context.WithDeadline(ctx, o.drainStartedOn.Add(o.Timeout)) - defer cancelMainContext() for i, pod := range pods { select { - case <-mainContext.Done(): + case <-ctx.Done(): // Timeout occurred. Abort and report the remaining pods. returnCh <- nil return append(retryPods, pods[i+1:]...), true @@ -739,7 +735,7 @@ func (o *Options) evictPodsWithPVInternal( ) podVolumeInfo := podVolumeInfoMap[getPodKey(pod)] - ctx, cancelFn := context.WithTimeout(mainContext, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout) + ctx, cancelFn := context.WithTimeout(ctx, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout) err = o.waitForDetach(ctx, podVolumeInfo, o.nodeName) cancelFn() @@ -762,7 +758,7 @@ func (o *Options) evictPodsWithPVInternal( time.Since(podEvictionStartTime), ) - ctx, cancelFn = context.WithTimeout(mainContext, o.PvReattachTimeout) + ctx, cancelFn = context.WithTimeout(ctx, o.PvReattachTimeout) err = o.waitForReattach(ctx, podVolumeInfo, o.nodeName, volumeAttachmentEventCh) cancelFn() @@ -1000,7 +996,12 @@ func (o *Options) evictPodWithoutPVInternal(ctx context.Context, attemptEvict bo if i >= nretries { attemptEvict = false } - + select { + case <-ctx.Done(): + // Timeout occurred. Abort and report the remaining pods. + returnCh <- fmt.Errorf("timeout occured while waiting for pod %q terminating scheduled on node %v", pod.Name, pod.Spec.NodeName) + default: + } if attemptEvict { err = o.evictPod(ctx, pod, policyGroupVersion) } else { diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 06a3e6da8..25554d76d 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1044,6 +1044,8 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ReadonlyFilesystem v1.NodeConditionType = "ReadonlyFilesystem" ) + drainContext, cancelFn := context.WithDeadline(ctx, deleteMachineRequest.Machine.DeletionTimestamp.Add(timeOutDuration)) + defer cancelFn() if !isValidNodeName(nodeName) { message := "Skipping drain as nodeName is not a valid one for machine." printLogInitError(message, &err, &description, machine) @@ -1112,7 +1114,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } // update node with the machine's phase prior to termination - if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil { + if err = c.UpdateNodeTerminationCondition(drainContext, machine); err != nil { if forceDeleteMachine { klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err) } else { @@ -1153,7 +1155,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver c.volumeAttachmentHandler, ) klog.V(3).Infof("(drainNode) Invoking RunDrain, forceDeleteMachine: %t, forceDeletePods: %t, timeOutDuration: %s", forceDeletePods, forceDeleteMachine, timeOutDuration) - err = drainOptions.RunDrain(ctx) + err = drainOptions.RunDrain(drainContext) if err == nil { // Drain successful klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf) @@ -1183,7 +1185,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } updateRetryPeriod, updateErr := c.machineStatusUpdate( - ctx, + drainContext, machine, v1alpha1.LastOperation{ Description: description, @@ -1408,6 +1410,7 @@ func (c *controller) getEffectiveDrainTimeout(machine *v1alpha1.Machine) *metav1 } else { effectiveDrainTimeout = &c.safetyOptions.MachineDrainTimeout } + effectiveDrainTimeout.Duration -= time.Now().Sub(machine.DeletionTimestamp.Time) return effectiveDrainTimeout } From 25c1646538ad16c6726c6a6c50aa84f68e4879e3 Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Tue, 25 Jun 2024 12:50:28 +0530 Subject: [PATCH 2/6] removed select for context cancellation --- pkg/util/provider/drain/drain.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index 975ac1759..4b8b0efd1 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -996,12 +996,6 @@ func (o *Options) evictPodWithoutPVInternal(ctx context.Context, attemptEvict bo if i >= nretries { attemptEvict = false } - select { - case <-ctx.Done(): - // Timeout occurred. Abort and report the remaining pods. - returnCh <- fmt.Errorf("timeout occured while waiting for pod %q terminating scheduled on node %v", pod.Name, pod.Spec.NodeName) - default: - } if attemptEvict { err = o.evictPod(ctx, pod, policyGroupVersion) } else { From 9dde1f84ee0e4c9054e5684e8fe7178b074b6469 Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Tue, 2 Jul 2024 11:51:12 +0530 Subject: [PATCH 3/6] passing drain context only in drainNode --- pkg/util/provider/drain/drain.go | 1 + pkg/util/provider/machinecontroller/machine_util.go | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index 4b8b0efd1..3b55d7c92 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -996,6 +996,7 @@ func (o *Options) evictPodWithoutPVInternal(ctx context.Context, attemptEvict bo if i >= nretries { attemptEvict = false } + if attemptEvict { err = o.evictPod(ctx, pod, policyGroupVersion) } else { diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 25554d76d..5158a581a 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1044,7 +1044,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ReadonlyFilesystem v1.NodeConditionType = "ReadonlyFilesystem" ) - drainContext, cancelFn := context.WithDeadline(ctx, deleteMachineRequest.Machine.DeletionTimestamp.Add(timeOutDuration)) + drainContext, cancelFn := context.WithDeadline(ctx, time.Now().Add(timeOutDuration)) defer cancelFn() if !isValidNodeName(nodeName) { message := "Skipping drain as nodeName is not a valid one for machine." @@ -1114,7 +1114,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } // update node with the machine's phase prior to termination - if err = c.UpdateNodeTerminationCondition(drainContext, machine); err != nil { + if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil { if forceDeleteMachine { klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err) } else { @@ -1185,7 +1185,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } updateRetryPeriod, updateErr := c.machineStatusUpdate( - drainContext, + ctx, machine, v1alpha1.LastOperation{ Description: description, From ada990231f3a162adf887f3a9ce2a6e59d944640 Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Tue, 2 Jul 2024 11:54:03 +0530 Subject: [PATCH 4/6] removed effective drain timeout modification --- pkg/util/provider/machinecontroller/machine_util.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 5158a581a..2f3bbaded 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1410,7 +1410,6 @@ func (c *controller) getEffectiveDrainTimeout(machine *v1alpha1.Machine) *metav1 } else { effectiveDrainTimeout = &c.safetyOptions.MachineDrainTimeout } - effectiveDrainTimeout.Duration -= time.Now().Sub(machine.DeletionTimestamp.Time) return effectiveDrainTimeout } From 9051383b40ae1753f187c9ebae3af7530dceeb42 Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:11:39 +0530 Subject: [PATCH 5/6] create drainContext in RunDrain --- pkg/util/provider/drain/drain.go | 7 +++++-- pkg/util/provider/machinecontroller/machine_util.go | 5 +---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index 3b55d7c92..ca17324a8 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -189,6 +189,7 @@ func NewDrainOptions( // RunDrain runs the 'drain' command func (o *Options) RunDrain(ctx context.Context) error { o.drainStartedOn = time.Now() + drainContext, cancelFn := context.WithDeadline(ctx, o.drainStartedOn.Add(o.Timeout)) klog.V(4).Infof( "Machine drain started on %s for %q", o.drainStartedOn, @@ -197,6 +198,7 @@ func (o *Options) RunDrain(ctx context.Context) error { defer func() { o.drainEndedOn = time.Now() + cancelFn() klog.Infof( "Machine drain ended on %s and took %s for %q", o.drainEndedOn, @@ -205,12 +207,12 @@ func (o *Options) RunDrain(ctx context.Context) error { ) }() - if err := o.RunCordonOrUncordon(ctx, true); err != nil { + if err := o.RunCordonOrUncordon(drainContext, true); err != nil { klog.Errorf("Drain Error: Cordoning of node failed with error: %v", err) return err } - err := o.deleteOrEvictPodsSimple(ctx) + err := o.deleteOrEvictPodsSimple(drainContext) return err } @@ -379,6 +381,7 @@ func (o *Options) evictPod(ctx context.Context, pod *corev1.Pod, policyGroupVers } klog.V(3).Infof("Attempting to evict the pod:%q from node %q", pod.Name, o.nodeName) // TODO: Remember to change the URL manipulation func when Evction's version change + time.Sleep(6 * time.Minute) return o.client.PolicyV1beta1().Evictions(eviction.Namespace).Evict(ctx, eviction) } diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 2f3bbaded..e41224f86 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1043,9 +1043,6 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver nodeNotReadyDuration = 5 * time.Minute ReadonlyFilesystem v1.NodeConditionType = "ReadonlyFilesystem" ) - - drainContext, cancelFn := context.WithDeadline(ctx, time.Now().Add(timeOutDuration)) - defer cancelFn() if !isValidNodeName(nodeName) { message := "Skipping drain as nodeName is not a valid one for machine." printLogInitError(message, &err, &description, machine) @@ -1155,7 +1152,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver c.volumeAttachmentHandler, ) klog.V(3).Infof("(drainNode) Invoking RunDrain, forceDeleteMachine: %t, forceDeletePods: %t, timeOutDuration: %s", forceDeletePods, forceDeleteMachine, timeOutDuration) - err = drainOptions.RunDrain(drainContext) + err = drainOptions.RunDrain(ctx) if err == nil { // Drain successful klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf) From 1b5f6e4e6ae16ba6b501bfbc99c3a78a6fad2adb Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:13:00 +0530 Subject: [PATCH 6/6] remove time.Sleep --- pkg/util/provider/drain/drain.go | 1 - pkg/util/provider/machinecontroller/machine_util.go | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index ca17324a8..be5247339 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -381,7 +381,6 @@ func (o *Options) evictPod(ctx context.Context, pod *corev1.Pod, policyGroupVers } klog.V(3).Infof("Attempting to evict the pod:%q from node %q", pod.Name, o.nodeName) // TODO: Remember to change the URL manipulation func when Evction's version change - time.Sleep(6 * time.Minute) return o.client.PolicyV1beta1().Evictions(eviction.Namespace).Evict(ctx, eviction) } diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index e41224f86..06a3e6da8 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1043,6 +1043,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver nodeNotReadyDuration = 5 * time.Minute ReadonlyFilesystem v1.NodeConditionType = "ReadonlyFilesystem" ) + if !isValidNodeName(nodeName) { message := "Skipping drain as nodeName is not a valid one for machine." printLogInitError(message, &err, &description, machine)