Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Implement MS remediating conditions #11382

Merged
merged 3 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion api/v1beta1/machineset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,37 @@ const (
MachineSetMachinesUpToDateInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason
)

// Conditions that will be used for the MachineSet object in v1Beta2 API version.
// MachineSet's Remediating condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any.
MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition

// MachineSetRemediatingV1Beta2Reason surfaces when the MachineSet has at least one machine with HealthCheckSucceeded set to false
// and with the OwnerRemediated condition set to false.
MachineSetRemediatingV1Beta2Reason = RemediatingV1Beta2Reason

// MachineSetNotRemediatingV1Beta2Reason surfaces when the MachineSet does not have any machine with HealthCheckSucceeded set to false
// and with the OwnerRemediated condition set to false.
MachineSetNotRemediatingV1Beta2Reason = NotRemediatingV1Beta2Reason

// MachineSetRemediatingInternalErrorV1Beta2Reason surfaces unexpected failures when computing the Remediating condition.
MachineSetRemediatingInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason
)

// Reasons that will be used for the OwnerRemediated condition set by MachineHealthCheck on MachineSet controlled machines
// being remediated in v1Beta2 API version.
const (
// MachineSetMachineCannotBeRemediatedV1Beta2Reason surfaces when remediation of a MachineSet machine can't be started.
MachineSetMachineCannotBeRemediatedV1Beta2Reason = "CannotBeRemediated"

// MachineSetMachineRemediationDeferredV1Beta2Reason surfaces when remediation of a MachineSet machine must be deferred.
MachineSetMachineRemediationDeferredV1Beta2Reason = "RemediationDeferred"

// MachineSetMachineRemediationMachineDeletedV1Beta2Reason surfaces when remediation of a MachineSet machine
// has been completed by deleting the unhealthy machine.
// Note: After an unhealthy machine is deleted, a new one is created by the MachineSet as part of the
// regular reconcile loop that ensures the correct number of replicas exist.
MachineSetMachineRemediationMachineDeletedV1Beta2Reason = "MachineDeleted"
)

// MachineSet's Deleting condition and corresponding reasons that will be used in v1Beta2 API version.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ const (
// Reasons that will be used for the OwnerRemediated condition set by MachineHealthCheck on KubeadmControlPlane controlled machines
// being remediated in v1Beta2 API version.
const (
// KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason surfaces unexpected failures while remediation a control plane machine.
// KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason surfaces unexpected failures while remediating a control plane machine.
KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason = clusterv1.InternalErrorV1Beta2Reason

// KubeadmControlPlaneMachineCannotBeRemediatedV1Beta2Reason surfaces when remediation of a control plane machine can't be started.
Expand Down
2 changes: 0 additions & 2 deletions internal/controllers/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,8 +309,6 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
clusterv1.BootstrapReadyCondition,
clusterv1.InfrastructureReadyCondition,
clusterv1.DrainingSucceededCondition,
clusterv1.MachineHealthCheckSucceededCondition,
clusterv1.MachineOwnerRemediatedCondition,
}},
patch.WithOwnedV1Beta2Conditions{Conditions: []string{
clusterv1.MachineAvailableV1Beta2Condition,
Expand Down
188 changes: 146 additions & 42 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ import (
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
utilconversion "sigs.k8s.io/cluster-api/util/conversion"
"sigs.k8s.io/cluster-api/util/finalizers"
"sigs.k8s.io/cluster-api/util/labels/format"
Expand Down Expand Up @@ -1195,19 +1196,75 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (

cluster := s.cluster
ms := s.machineSet
filteredMachines := s.machines
machines := s.machines
owner := s.owningMachineDeployment
log := ctrl.LoggerFrom(ctx)

// Remove OwnerRemediated condition from Machines that have HealthCheckSucceeded condition true
// and OwnerRemediated condition false
errList := []error{}
for _, m := range machines {
if !m.DeletionTimestamp.IsZero() {
continue
}

shouldCleanup := conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition)
shouldCleanupV1Beta2 := v1beta2conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededV1Beta2Condition) && v1beta2conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)

if !(shouldCleanup || shouldCleanupV1Beta2) {
continue
}

patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
errList = append(errList, err)
continue
}

if shouldCleanup {
conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)
}

if shouldCleanupV1Beta2 {
v1beta2conditions.Delete(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)
}

if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineOwnerRemediatedCondition,
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
clusterv1.MachineOwnerRemediatedV1Beta2Condition,
}}); err != nil {
errList = append(errList, err)
}
}
if len(errList) > 0 {
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errList), "failed to remove OwnerRemediated condition from healhty Machines")
}

machinesToRemediate := collections.FromMachines(machines...).Filter(collections.IsUnhealthyAndOwnerRemediated, collections.Not(collections.HasDeletionTimestamp)).UnsortedList()
sbueringer marked this conversation as resolved.
Show resolved Hide resolved

// If there are no machines to remediate return early.
if len(machinesToRemediate) == 0 {
return ctrl.Result{}, nil
}

// Calculate how many in flight machines we should remediate.
// By default, we allow all machines to be remediated at the same time.
maxInFlight := len(filteredMachines)
maxInFlight := len(machinesToRemediate)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the original versions this was len(all machines).
However, this should not make a difference because in both cases, if mf.Spec.Strategy.Remediation.MaxInFlight is not set, both values are good enough to ensure we can always delete all the machines to be remediated.
Is this correct?

Copy link
Member Author

@sbueringer sbueringer Nov 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! I was just thinking that len(machinesToRemediate) makes more sense. As we are only remediating at most len(machinesToRemediate) not all Machines.

I guess we could also just set this to MaxInt. Just slightly more obvious :) (I"ll probably do that)


// If the MachineSet is part of a MachineDeployment, only allow remediations if
// it's the desired revision.
if isDeploymentChild(ms) {
if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] {
// MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed.
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineCannotBeRemediatedV1Beta2Reason,
Message: "Machine won't be remediated because it is pending removal due to rollout",
}, nil); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}

Expand All @@ -1224,31 +1281,31 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
}
}

// List all unhealthy machines.
machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
for _, m := range filteredMachines {
// filteredMachines contains machines in deleting status to calculate correct status.
// skip remediation for those in deleting status.
// Update maxInFlight based on remediations that are in flight.
sbueringer marked this conversation as resolved.
Show resolved Hide resolved
for _, m := range machines {
if !m.DeletionTimestamp.IsZero() {
// TODO: Check for Status: False and Reason: MachineSetMachineRemediationMachineDeletedV1Beta2Reason
// instead when starting to use v1beta2 conditions for control flow.
if conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition) {
// Machine has been remediated by this controller and still in flight.
// Remediation for this Machine has been triggered by this controller but it is still in flight,
// i.e. it still goes through the deletion workflow and exists in etcd.
maxInFlight--
}
continue
}
if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) {
machinesToRemediate = append(machinesToRemediate, m)
}
}

// If there are no machines to remediate return early.
if len(machinesToRemediate) == 0 {
return ctrl.Result{}, nil
}
// Check if we can remediate any machines.
if maxInFlight <= 0 {
// No tokens available to remediate machines.
log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(machinesToRemediate))
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
Message: fmt.Sprintf("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)", owner.Spec.Strategy.Remediation.MaxInFlight),
}, nil); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}

Expand All @@ -1263,11 +1320,22 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
if len(machinesToRemediate) > maxInFlight {
log.V(5).Info("Remediation strategy is set, limiting in flight operations", "machinesToBeRemediated", len(machinesToRemediate))
// We have more machines to remediate than tokens available.
machinesToRemediate = machinesToRemediate[:maxInFlight]
allMachinesToRemediate := machinesToRemediate
machinesToRemediate = allMachinesToRemediate[:maxInFlight]
machinesToDeferRemediation := allMachinesToRemediate[maxInFlight:]

if err := patchMachineConditions(ctx, r.Client, machinesToDeferRemediation, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
Message: fmt.Sprintf("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)", owner.Spec.Strategy.Remediation.MaxInFlight),
}, nil); err != nil {
return ctrl.Result{}, err
}
}

// Run preflight checks.
preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation")
preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine remediation")
if err != nil {
// If err is not nil use that as the preflightCheckErrMessage
preflightCheckErrMessage = err.Error()
Expand All @@ -1277,48 +1345,84 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
if preflightChecksFailed {
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
// WaitingForRemediationReason reason.
var errs []error
for _, m := range machinesToRemediate {
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
errs = append(errs, err)
continue
}
conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage)
if err := patchHelper.Patch(ctx, m); err != nil {
errs = append(errs, err)
}
}

if len(errs) > 0 {
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines")
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
Message: preflightCheckErrMessage,
}, &clusterv1.Condition{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
Reason: clusterv1.WaitingForRemediationReason,
Severity: clusterv1.ConditionSeverityWarning,
Message: preflightCheckErrMessage,
}); err != nil {
return ctrl.Result{}, err
}
return preflightChecksResult, nil
}

// PreflightChecks passed, so it is safe to remediate unhealthy machines.
// Remediate unhealthy machines by deleting them.
// PreflightChecks passed, so it is safe to remediate unhealthy machines by deleting them.

// Note: We intentionally patch the Machines before we delete them to make this code reentrant.
// If we delete the Machine first, the Machine would be filtered out on next reconcile because
// it has a deletionTimestamp so it would never get the condition.
// Instead if we set the condition but the deletion does not go through on next reconcile either the
// condition will be fixed/updated or the Machine deletion will be retried.
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
}, &clusterv1.Condition{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionTrue,
}); err != nil {
return ctrl.Result{}, err
}
var errs []error
for _, m := range machinesToRemediate {
log.Info("Deleting unhealthy Machine", "Machine", klog.KObj(m))
patch := client.MergeFrom(m.DeepCopy())
if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
continue
}
conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m)))
}
}

if len(errs) > 0 {
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")
}

return ctrl.Result{}, nil
}

func patchMachineConditions(ctx context.Context, c client.Client, machines []*clusterv1.Machine, v1beta2Condition metav1.Condition, condition *clusterv1.Condition) error {
var errs []error
for _, m := range machines {
patchHelper, err := patch.NewHelper(m, c)
if err != nil {
errs = append(errs, err)
continue
}

if condition != nil {
conditions.Set(m, condition)
}
v1beta2conditions.Set(m, v1beta2Condition)

if err := patchHelper.Patch(ctx, m,
patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineOwnerRemediatedCondition,
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
clusterv1.MachineOwnerRemediatedV1Beta2Condition,
}}); err != nil {
errs = append(errs, err)
}
}
if len(errs) > 0 {
return errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch Machines")
}

return nil
}

func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, owner *clusterv1.MachineDeployment, ref *corev1.ObjectReference) (objectNotFound bool, err error) {
if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
return false, nil
Expand Down
Loading