Skip to content

Commit

Permalink
✨ Add MachineDrainRule "WaitCompleted"
Browse files Browse the repository at this point in the history
Signed-off-by: Vince Prignano <vince@prigna.com>
  • Loading branch information
vincepri committed Jan 9, 2025
1 parent fe6a595 commit 6650be9
Show file tree
Hide file tree
Showing 11 changed files with 510 additions and 99 deletions.
14 changes: 10 additions & 4 deletions api/v1beta1/machinedrainrules_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ import (

const (
// PodDrainLabel is the label that can be set on Pods in workload clusters to ensure a Pod is not drained.
// The only valid value is "skip".
// The only valid values are "skip" and "wait-completed".
// This label takes precedence over MachineDrainRules defined in the management cluster.
PodDrainLabel = "cluster.x-k8s.io/drain"
)

// MachineDrainRuleDrainBehavior defines the drain behavior. Can be either "Drain" or "Skip".
// +kubebuilder:validation:Enum=Drain;Skip
// MachineDrainRuleDrainBehavior defines the drain behavior. Can be either "Drain", "Skip", or "WaitCompleted".
// +kubebuilder:validation:Enum=Drain;Skip;WaitCompleted
type MachineDrainRuleDrainBehavior string

const (
Expand All @@ -37,6 +37,10 @@ const (

// MachineDrainRuleDrainBehaviorSkip means the drain for a Pod should be skipped.
MachineDrainRuleDrainBehaviorSkip MachineDrainRuleDrainBehavior = "Skip"

// MachineDrainRuleDrainBehaviorWaitCompleted means the Pod should not be evicted,
// but overall drain should wait until the Pod completes.
MachineDrainRuleDrainBehaviorWaitCompleted MachineDrainRuleDrainBehavior = "WaitCompleted"
)

// MachineDrainRuleSpec defines the spec of a MachineDrainRule.
Expand Down Expand Up @@ -112,14 +116,16 @@ type MachineDrainRuleSpec struct {
// MachineDrainRuleDrainConfig configures if and how Pods are drained.
type MachineDrainRuleDrainConfig struct {
// behavior defines the drain behavior.
// Can be either "Drain" or "Skip".
// Can be either "Drain", "Skip", or "WaitCompleted".
// "Drain" means that the Pods to which this MachineDrainRule applies will be drained.
// If behavior is set to "Drain" the order in which Pods are drained can be configured
// with the order field. When draining Pods of a Node the Pods will be grouped by order
// and one group after another will be drained (by increasing order). Cluster API will
// wait until all Pods of a group are terminated / removed from the Node before starting
// with the next group.
// "Skip" means that the Pods to which this MachineDrainRule applies will be skipped during drain.
// "WaitCompleted" means that the pods to which this MachineDrainRule applies will never be evicted
// and we wait for them to be completed.
// +required
Behavior MachineDrainRuleDrainBehavior `json:"behavior"`

Expand Down
1 change: 1 addition & 0 deletions config/crd/bases/cluster.x-k8s.io_machinedrainrules.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

107 changes: 55 additions & 52 deletions docs/proposals/20240930-machine-drain-rules.md

Large diffs are not rendered by default.

26 changes: 22 additions & 4 deletions internal/controllers/machine/drain/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ func (d *Helper) GetPodsForEviction(ctx context.Context, cluster *clusterv1.Clus
podNamespaces[ns.Name] = &ns
}

// Note: As soon as a filter decides that a Pod should be skipped (i.e. DrainBehavior == "Skip")
// Note: As soon as a filter decides that a Pod should be skipped (i.e. DrainBehavior == "Skip" or "WaitCompleted")
// other filters won't be evaluated and the Pod will be skipped.
list := filterPods(ctx, allPods, []PodFilter{
// Phase 1: Basic filtering (aligned to kubectl drain)
Expand All @@ -150,7 +150,7 @@ func (d *Helper) GetPodsForEviction(ctx context.Context, cluster *clusterv1.Clus

// Phase 2: Filtering based on drain label & MachineDrainRules

// Skip Pods with label cluster.x-k8s.io/drain == "skip"
// Skip Pods with label cluster.x-k8s.io/drain == "skip" or "wait-completed"
d.drainLabelFilter,

// Use drain behavior and order from first matching MachineDrainRule
Expand Down Expand Up @@ -225,7 +225,8 @@ func filterPods(ctx context.Context, allPods []*corev1.Pod, filters []PodFilter)
var deleteWarnings []string
for _, filter := range filters {
status = filter(ctx, pod)
if status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorSkip {
if status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorSkip ||
status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted {
// short-circuit as soon as pod is filtered out
// at that point, there is no reason to run pod
// through any additional filters
Expand Down Expand Up @@ -281,6 +282,7 @@ func (d *Helper) EvictPods(ctx context.Context, podDeleteList *PodDeleteList) Ev
var podsToTriggerEvictionLater []PodDelete
var podsWithDeletionTimestamp []PodDelete
var podsToBeIgnored []PodDelete
var podsToWaitCompleted []PodDelete
for _, pod := range podDeleteList.items {
switch {
case pod.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain && pod.Pod.DeletionTimestamp.IsZero():
Expand All @@ -289,6 +291,8 @@ func (d *Helper) EvictPods(ctx context.Context, podDeleteList *PodDeleteList) Ev
} else {
podsToTriggerEvictionLater = append(podsToTriggerEvictionLater, pod)
}
case pod.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted:
podsToWaitCompleted = append(podsToWaitCompleted, pod)
case pod.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain:
podsWithDeletionTimestamp = append(podsWithDeletionTimestamp, pod)
default:
Expand All @@ -300,6 +304,7 @@ func (d *Helper) EvictPods(ctx context.Context, podDeleteList *PodDeleteList) Ev
"podsToTriggerEvictionNow", podDeleteListToString(podsToTriggerEvictionNow, 5),
"podsToTriggerEvictionLater", podDeleteListToString(podsToTriggerEvictionLater, 5),
"podsWithDeletionTimestamp", podDeleteListToString(podsWithDeletionTimestamp, 5),
"podsToWaitCompleted", podDeleteListToString(podsToWaitCompleted, 5),
)

// Trigger evictions for at most 10s. We'll continue on the next reconcile if we hit the timeout.
Expand Down Expand Up @@ -394,6 +399,10 @@ evictionLoop:
res.PodsToTriggerEvictionLater = append(res.PodsToTriggerEvictionLater, pd.Pod)
}

for _, pd := range podsToWaitCompleted {
res.PodsToWaitCompleted = append(res.PodsToWaitCompleted, pd.Pod)
}

return res
}

Expand All @@ -403,6 +412,9 @@ func minDrainOrderOfPodsToDrain(pds []PodDelete) int32 {
if pd.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain && ptr.Deref(pd.Status.DrainOrder, 0) < minOrder {
minOrder = ptr.Deref(pd.Status.DrainOrder, 0)
}
if pd.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted && 0 < minOrder {
minOrder = 0
}
}
return minOrder
}
Expand Down Expand Up @@ -431,13 +443,15 @@ type EvictionResult struct {
PodsDeletionTimestampSet []*corev1.Pod
PodsFailedEviction map[string][]*corev1.Pod
PodsToTriggerEvictionLater []*corev1.Pod
PodsToWaitCompleted []*corev1.Pod
PodsNotFound []*corev1.Pod
PodsIgnored []*corev1.Pod
}

// DrainCompleted returns if a Node is entirely drained, i.e. if all relevant Pods have gone away.
func (r EvictionResult) DrainCompleted() bool {
return len(r.PodsDeletionTimestampSet) == 0 && len(r.PodsFailedEviction) == 0 && len(r.PodsToTriggerEvictionLater) == 0
return len(r.PodsDeletionTimestampSet) == 0 && len(r.PodsFailedEviction) == 0 &&
len(r.PodsToTriggerEvictionLater) == 0 && len(r.PodsToWaitCompleted) == 0
}

// ConditionMessage returns a condition message for the case where a drain is not completed.
Expand Down Expand Up @@ -498,6 +512,10 @@ func (r EvictionResult) ConditionMessage(nodeDrainStartTime *metav1.Time) string
conditionMessage = fmt.Sprintf("%s\nAfter above Pods have been removed from the Node, the following Pods will be evicted: %s",
conditionMessage, PodListToString(r.PodsToTriggerEvictionLater, 3))
}
if len(r.PodsToWaitCompleted) > 0 {
conditionMessage = fmt.Sprintf("%s\nWaiting for the following Pods to complete without eviction: %s",
conditionMessage, PodListToString(r.PodsToWaitCompleted, 3))
}
return conditionMessage
}

Expand Down
Loading

0 comments on commit 6650be9

Please sign in to comment.