From 726572cb90fdcecf2e7d2c7da19bb85478092b9d Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Thu, 16 Jan 2025 13:46:03 +0100 Subject: [PATCH 1/6] Improve Machine create and delete logs --- .../kubeadm/internal/controllers/controller.go | 2 +- controlplane/kubeadm/internal/controllers/helpers.go | 6 ++++++ .../kubeadm/internal/controllers/remediation.go | 2 +- controlplane/kubeadm/internal/controllers/scale.go | 1 + .../controllers/machineset/machineset_controller.go | 10 +++++----- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index 622023264805..bcb6dca157bb 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -672,7 +672,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con continue } - log.Info("Deleting control plane Machine") + log.Info("Deleting Machine (KCP deleted)") if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete))) } diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index 82f68eb225b9..7743510d8c8d 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -29,6 +29,7 @@ import ( "k8s.io/apimachinery/pkg/types" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apiserver/pkg/storage/names" + "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -184,6 +185,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileExternalReference(ctx context.C func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec, failureDomain *string) error { var errs []error + log := ctrl.LoggerFrom(ctx) // Compute desired Machine machine, err := r.computeDesiredMachine(kcp, cluster, failureDomain, nil) @@ -252,6 +254,10 @@ func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx conte return kerrors.NewAggregate(errs) } + log.Info("Machine created (scale up)", + "Machine", klog.KObj(machine), + infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name), + bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name)) return nil } diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index e2c24d93d8c7..3e20fa197fdf 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -311,7 +311,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C } // Surface the operation is in progress. - log.Info("Remediating unhealthy machine") + log.Info("Deleting Machine (remediating unhealthy Machine)") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") v1beta2conditions.Set(machineToBeRemediated, metav1.Condition{ diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index cc41cdfa3e15..1fae895c1e06 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -139,6 +139,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( } logger = logger.WithValues("Machine", klog.KObj(machineToDelete)) + logger.Info("Deleting Machine (scale down)") if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) { logger.Error(err, "Failed to delete control plane machine") r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleDown", diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index aeb5ac4341ca..cbd6df07a0e7 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result // else delete owned machines. for _, machine := range machineList { if machine.DeletionTimestamp.IsZero() { - log.Info("Deleting Machine", "Machine", klog.KObj(machine)) + log.Info("Deleting Machine (MS deleted)", "Machine", klog.KObj(machine)) if err := r.Client.Delete(ctx, machine); err != nil && !apierrors.IsNotFound(err) { return ctrl.Result{}, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) } @@ -794,7 +794,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e continue } - log.Info(fmt.Sprintf("Created machine %d of %d", i+1, diff), "Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Machine created (scale up, creating %d of %d)", i+1, diff), "Machine", klog.KObj(machine)) r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulCreate", "Created machine %q", machine.Name) machineList = append(machineList, machine) } @@ -816,7 +816,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e for i, machine := range machinesToDelete { log := log.WithValues("Machine", klog.KObj(machine)) if machine.GetDeletionTimestamp().IsZero() { - log.Info(fmt.Sprintf("Deleting machine %d of %d", i+1, diff)) + log.Info(fmt.Sprintf("Deleting Machine (scale down, deleting %d of %d)", i+1, diff)) if err := r.Client.Delete(ctx, machine); err != nil { log.Error(err, "Unable to delete Machine") r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err) @@ -825,7 +825,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e } r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name) } else { - log.Info(fmt.Sprintf("Waiting for machine %d of %d to be deleted", i+1, diff)) + log.Info(fmt.Sprintf("Waiting for Machine to be deleted (scale down, deleting %d of %d)", i+1, diff)) } } @@ -1492,7 +1492,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( } var errs []error for _, m := range machinesToRemediate { - log.Info("Deleting unhealthy Machine", "Machine", klog.KObj(m)) + log.Info("Deleting Machine (remediating unhealthy Machine)", "Machine", klog.KObj(m)) if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m))) } From 5d60723b0ed21601de4aebb321721f02a337994e Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Fri, 17 Jan 2025 12:18:11 +0100 Subject: [PATCH 2/6] Add k/v pairs describing the overall status of the control plane --- .../kubeadm/internal/control_plane.go | 80 +++++++++++++++++++ .../internal/controllers/controller.go | 3 +- .../kubeadm/internal/controllers/helpers.go | 17 ++-- .../internal/controllers/remediation.go | 3 +- .../kubeadm/internal/controllers/scale.go | 22 ++++- util/collections/machine_collection.go | 10 +++ 6 files changed, 117 insertions(+), 18 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index f79dee1d8ddd..11dbba3fdf3c 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -18,6 +18,9 @@ package internal import ( "context" + "fmt" + "sort" + "strings" "github.com/pkg/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -32,6 +35,7 @@ import ( controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/util/collections" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/failuredomains" "sigs.k8s.io/cluster-api/util/patch" ) @@ -385,3 +389,79 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC c.managementCluster = managementCluster c.workloadCluster = nil } + +// StatusToLogKeyAndValues returns following key/value pairs describing the overall status of the control plane: +// - machines is the list of KCP machines; each machine might have additional notes surfacing +// - if the machine has been created in the current reconcile (new) +// - if machines node is not yet (node ref not set) +// - if the machine has bee marked for remediation (health check failed) +// - if there are unhealthy control plane component on the machine +// - if the machine has a deletion timestamp/has been deleted in the current reconcile (deleting) +// - if the machine is not up to date with the KCP spec (not up to date) +// +// - etcdMembers list as reported by etcd. +func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clusterv1.Machine) []any { + controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ + controlplanev1.MachineAPIServerPodHealthyCondition, + controlplanev1.MachineControllerManagerPodHealthyCondition, + controlplanev1.MachineSchedulerPodHealthyCondition, + } + if c.IsEtcdManaged() { + controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions, + controlplanev1.MachineEtcdPodHealthyCondition, + controlplanev1.MachineEtcdMemberHealthyCondition, + ) + } + + machines := []string{} + for _, m := range c.Machines { + notes := []string{} + + if m.Status.NodeRef == nil { + notes = append(notes, "node ref not set") + } + + if c.MachinesToBeRemediatedByKCP().Has(m) { + notes = append(notes, "health check failed") + } + + for _, condition := range controlPlaneMachineHealthConditions { + if conditions.IsUnknown(m, condition) { + notes = append(notes, strings.Replace(string(condition), "Healthy", " health unknown", -1)) + } + if conditions.IsFalse(m, condition) { + notes = append(notes, strings.Replace(string(condition), "Healthy", " not healthy", -1)) + } + } + + if !c.UpToDateMachines().Has(m) { + notes = append(notes, "not up to date") + } + + if !m.DeletionTimestamp.IsZero() || (deletedMachine != nil && m.Name == deletedMachine.Name) { + notes = append(notes, "deleting") + } + + name := m.Name + if len(notes) > 0 { + name = fmt.Sprintf("%s (%s)", name, strings.Join(notes, ", ")) + } + machines = append(machines, name) + } + + if newMachine != nil { + machines = append(machines, fmt.Sprintf("%s (new)", newMachine.Name)) + } + sort.Strings(machines) + + etcdMembers := []string{} + for _, m := range c.EtcdMembers { + etcdMembers = append(etcdMembers, m.Name) + } + sort.Strings(etcdMembers) + + return []any{ + "machines", strings.Join(machines, ", "), + "etcdMembers", strings.Join(etcdMembers, ", "), + } +} diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index bcb6dca157bb..d3bfbe07794d 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -672,7 +672,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con continue } - log.Info("Deleting Machine (KCP deleted)") + log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...). + Info("Deleting Machine (KCP deleted)") if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete))) } diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index 7743510d8c8d..dc3f5b24b5d1 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -29,7 +29,6 @@ import ( "k8s.io/apimachinery/pkg/types" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apiserver/pkg/storage/names" - "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -183,14 +182,13 @@ func (r *KubeadmControlPlaneReconciler) reconcileExternalReference(ctx context.C return patchHelper.Patch(ctx, obj) } -func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec, failureDomain *string) error { +func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec, failureDomain *string) (*clusterv1.Machine, error) { var errs []error - log := ctrl.LoggerFrom(ctx) // Compute desired Machine machine, err := r.computeDesiredMachine(kcp, cluster, failureDomain, nil) if err != nil { - return errors.Wrap(err, "failed to create Machine: failed to compute desired Machine") + return nil, errors.Wrap(err, "failed to create Machine: failed to compute desired Machine") } // Since the cloned resource should eventually have a controller ref for the Machine, we create an @@ -222,7 +220,7 @@ func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx conte // Safe to return early here since no resources have been created yet. conditions.MarkFalse(kcp, controlplanev1.MachinesCreatedCondition, controlplanev1.InfrastructureTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return errors.Wrap(err, "failed to clone infrastructure template") + return nil, errors.Wrap(err, "failed to clone infrastructure template") } machine.Spec.InfrastructureRef = *infraRef @@ -250,15 +248,10 @@ func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx conte if err := r.cleanupFromGeneration(ctx, infraRef, bootstrapRef); err != nil { errs = append(errs, errors.Wrap(err, "failed to cleanup generated resources")) } - - return kerrors.NewAggregate(errs) + return nil, kerrors.NewAggregate(errs) } - log.Info("Machine created (scale up)", - "Machine", klog.KObj(machine), - infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name), - bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name)) - return nil + return machine, nil } func (r *KubeadmControlPlaneReconciler) cleanupFromGeneration(ctx context.Context, remoteRefs ...*corev1.ObjectReference) error { diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 3e20fa197fdf..1dd98641eb05 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -311,7 +311,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C } // Surface the operation is in progress. - log.Info("Deleting Machine (remediating unhealthy Machine)") + log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToBeRemediated)...). + Info("Deleting Machine (remediating unhealthy Machine)") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") v1beta2conditions.Set(machineToBeRemediated, metav1.Condition{ diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 1fae895c1e06..bbfe7b77bad7 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -53,12 +53,19 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte return ctrl.Result{}, err } - if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil { + newMachine, err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd) + if err != nil { logger.Error(err, "Failed to create initial control plane Machine") r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedInitialization", "Failed to create initial control plane Machine for cluster %s control plane: %v", klog.KObj(controlPlane.Cluster), err) return ctrl.Result{}, err } + logger.WithValues(controlPlane.StatusToLogKeyAndValues(newMachine, nil)...). + Info("Machine created (scale up)", + "Machine", klog.KObj(newMachine), + newMachine.Spec.InfrastructureRef.Kind, klog.KRef(newMachine.Spec.InfrastructureRef.Namespace, newMachine.Spec.InfrastructureRef.Name), + newMachine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(newMachine.Spec.Bootstrap.ConfigRef.Namespace, newMachine.Spec.Bootstrap.ConfigRef.Name)) + // Requeue the control plane, in case there are additional operations to perform return ctrl.Result{Requeue: true}, nil } @@ -87,12 +94,19 @@ func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, return ctrl.Result{}, err } - if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil { + newMachine, err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd) + if err != nil { logger.Error(err, "Failed to create additional control plane Machine") r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleUp", "Failed to create additional control plane Machine for cluster % control plane: %v", klog.KObj(controlPlane.Cluster), err) return ctrl.Result{}, err } + logger.WithValues(controlPlane.StatusToLogKeyAndValues(newMachine, nil)...). + Info("Machine created (scale up)", + "Machine", klog.KObj(newMachine), + newMachine.Spec.InfrastructureRef.Kind, klog.KRef(newMachine.Spec.InfrastructureRef.Namespace, newMachine.Spec.InfrastructureRef.Name), + newMachine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(newMachine.Spec.Bootstrap.ConfigRef.Namespace, newMachine.Spec.Bootstrap.ConfigRef.Name)) + // Requeue the control plane, in case there are other operations to perform return ctrl.Result{Requeue: true}, nil } @@ -138,14 +152,14 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( // NOTE: etcd member removal will be performed by the kcp-cleanup hook after machine completes drain & all volumes are detached. } - logger = logger.WithValues("Machine", klog.KObj(machineToDelete)) - logger.Info("Deleting Machine (scale down)") if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) { logger.Error(err, "Failed to delete control plane machine") r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleDown", "Failed to delete control plane Machine %s for cluster %s control plane: %v", machineToDelete.Name, klog.KObj(controlPlane.Cluster), err) return ctrl.Result{}, err } + logger.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...). + Info("Deleting Machine (scale down)", "Machine", klog.KObj(machineToDelete)) // Requeue the control plane, in case there are additional operations to perform return ctrl.Result{Requeue: true}, nil diff --git a/util/collections/machine_collection.go b/util/collections/machine_collection.go index a8c686a8a979..ed8686810d2d 100644 --- a/util/collections/machine_collection.go +++ b/util/collections/machine_collection.go @@ -123,6 +123,16 @@ func ToMachineList(machines Machines) clusterv1.MachineList { return ml } +// Has return true when the collection has the given machine. +func (s Machines) Has(machine *clusterv1.Machine) bool { + for _, m := range s { + if m.Name == machine.Name && m.Namespace == machine.Namespace { + return true + } + } + return false +} + // Insert adds items to the set. func (s Machines) Insert(machines ...*clusterv1.Machine) { for i := range machines { From be1bb7afeedd16505d409b66ae7f68ef0fffb730 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Fri, 17 Jan 2025 13:44:51 +0100 Subject: [PATCH 3/6] Fix unit test --- controlplane/kubeadm/internal/controllers/helpers_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/helpers_test.go b/controlplane/kubeadm/internal/controllers/helpers_test.go index a36a65c5f97c..18dfa1157b3d 100644 --- a/controlplane/kubeadm/internal/controllers/helpers_test.go +++ b/controlplane/kubeadm/internal/controllers/helpers_test.go @@ -374,7 +374,8 @@ func TestCloneConfigsAndGenerateMachine(t *testing.T) { bootstrapSpec := &bootstrapv1.KubeadmConfigSpec{ JoinConfiguration: &bootstrapv1.JoinConfiguration{}, } - g.Expect(r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec, nil)).To(Succeed()) + _, err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec, nil) + g.Expect(err).To(Succeed()) machineList := &clusterv1.MachineList{} g.Expect(env.GetAPIReader().List(ctx, machineList, client.InNamespace(cluster.Namespace))).To(Succeed()) @@ -463,7 +464,8 @@ func TestCloneConfigsAndGenerateMachineFail(t *testing.T) { // Try to break Infra Cloning kcp.Spec.MachineTemplate.InfrastructureRef.Name = "something_invalid" - g.Expect(r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec, nil)).To(HaveOccurred()) + _, err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec, nil) + g.Expect(err).To(HaveOccurred()) g.Expect(&kcp.GetConditions()[0]).Should(conditions.HaveSameStateOf(&clusterv1.Condition{ Type: controlplanev1.MachinesCreatedCondition, Status: corev1.ConditionFalse, From 4945ddeccc9f127d3f8a313fc52a571223f48141 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Fri, 17 Jan 2025 13:45:08 +0100 Subject: [PATCH 4/6] Address comments --- internal/controllers/machineset/machineset_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index cbd6df07a0e7..ef08832ab1e2 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result // else delete owned machines. for _, machine := range machineList { if machine.DeletionTimestamp.IsZero() { - log.Info("Deleting Machine (MS deleted)", "Machine", klog.KObj(machine)) + log.Info("Deleting Machine (MachineSet deleted)", "Machine", klog.KObj(machine)) if err := r.Client.Delete(ctx, machine); err != nil && !apierrors.IsNotFound(err) { return ctrl.Result{}, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) } From 3f214b9c923fb56358c8f0914bdf921a70eaff06 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Fri, 17 Jan 2025 17:55:31 +0100 Subject: [PATCH 5/6] More feedback --- .../kubeadm/internal/control_plane.go | 22 +++--- .../kubeadm/internal/control_plane_test.go | 72 +++++++++++++++++++ .../internal/controllers/controller.go | 6 +- .../internal/controllers/remediation.go | 2 + .../kubeadm/internal/controllers/scale.go | 2 + .../machineset/machineset_controller.go | 12 +++- 6 files changed, 101 insertions(+), 15 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 11dbba3fdf3c..43da2fedb9ad 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -392,12 +392,12 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC // StatusToLogKeyAndValues returns following key/value pairs describing the overall status of the control plane: // - machines is the list of KCP machines; each machine might have additional notes surfacing -// - if the machine has been created in the current reconcile (new) -// - if machines node is not yet (node ref not set) -// - if the machine has bee marked for remediation (health check failed) +// - if the machine has been created in the current reconcile +// - if machines node ref is not yet set +// - if the machine has been marked for remediation // - if there are unhealthy control plane component on the machine -// - if the machine has a deletion timestamp/has been deleted in the current reconcile (deleting) -// - if the machine is not up to date with the KCP spec (not up to date) +// - if the machine has a deletion timestamp/has been deleted in the current reconcile +// - if the machine is not up to date with the KCP spec // // - etcdMembers list as reported by etcd. func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clusterv1.Machine) []any { @@ -418,11 +418,11 @@ func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clust notes := []string{} if m.Status.NodeRef == nil { - notes = append(notes, "node ref not set") + notes = append(notes, "status.nodeRef not set") } if c.MachinesToBeRemediatedByKCP().Has(m) { - notes = append(notes, "health check failed") + notes = append(notes, "marked for remediation") } for _, condition := range controlPlaneMachineHealthConditions { @@ -435,10 +435,12 @@ func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clust } if !c.UpToDateMachines().Has(m) { - notes = append(notes, "not up to date") + notes = append(notes, "not up-to-date") } - if !m.DeletionTimestamp.IsZero() || (deletedMachine != nil && m.Name == deletedMachine.Name) { + if deletedMachine != nil && m.Name == deletedMachine.Name { + notes = append(notes, "just deleted") + } else if !m.DeletionTimestamp.IsZero() { notes = append(notes, "deleting") } @@ -450,7 +452,7 @@ func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clust } if newMachine != nil { - machines = append(machines, fmt.Sprintf("%s (new)", newMachine.Name)) + machines = append(machines, fmt.Sprintf("%s (just created)", newMachine.Name)) } sort.Strings(machines) diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go index 5d02724de87d..2f9b84d263c2 100644 --- a/controlplane/kubeadm/internal/control_plane_test.go +++ b/controlplane/kubeadm/internal/control_plane_test.go @@ -17,8 +17,10 @@ limitations under the License. package internal import ( + "strings" "testing" + "github.com/google/go-cmp/cmp" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -26,6 +28,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" ) @@ -252,6 +255,75 @@ func TestHasHealthyMachineStillProvisioning(t *testing.T) { }) } +func TestStatusToLogKeyAndValues(t *testing.T) { + healthyMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{Name: "healthy"}, + Status: clusterv1.MachineStatus{ + NodeRef: &corev1.ObjectReference{Name: "healthy-node"}, + Conditions: []clusterv1.Condition{ + {Type: controlplanev1.MachineAPIServerPodHealthyCondition, Status: corev1.ConditionTrue}, + {Type: controlplanev1.MachineControllerManagerPodHealthyCondition, Status: corev1.ConditionTrue}, + {Type: controlplanev1.MachineSchedulerPodHealthyCondition, Status: corev1.ConditionTrue}, + {Type: controlplanev1.MachineEtcdPodHealthyCondition, Status: corev1.ConditionTrue}, + {Type: controlplanev1.MachineEtcdMemberHealthyCondition, Status: corev1.ConditionTrue}, + }, + }, + } + + machineWithoutNode := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{Name: "without-node"}, + Status: clusterv1.MachineStatus{ + NodeRef: nil, + Conditions: []clusterv1.Condition{ + {Type: controlplanev1.MachineAPIServerPodHealthyCondition, Status: corev1.ConditionUnknown}, + {Type: controlplanev1.MachineControllerManagerPodHealthyCondition, Status: corev1.ConditionUnknown}, + {Type: controlplanev1.MachineSchedulerPodHealthyCondition, Status: corev1.ConditionUnknown}, + {Type: controlplanev1.MachineEtcdPodHealthyCondition, Status: corev1.ConditionUnknown}, + {Type: controlplanev1.MachineEtcdMemberHealthyCondition, Status: corev1.ConditionFalse}, // not a real use case, but used to test a code branch. + }, + }, + } + + machineJustCreated := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "just-created"}} + + machineJustDeleted := healthyMachine.DeepCopy() + machineJustDeleted.Name = "just-deleted" + + machineNotUpToDate := healthyMachine.DeepCopy() + machineNotUpToDate.Name = "not-up-to-date" + + machineMarkedForRemediation := healthyMachine.DeepCopy() + machineMarkedForRemediation.Name = "marked-for-remediation" + machineMarkedForRemediation.Status.Conditions = append(machineMarkedForRemediation.Status.Conditions, + clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededCondition, Status: corev1.ConditionFalse}, + clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}, + ) + + g := NewWithT(t) + c := &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines(healthyMachine, machineWithoutNode, machineJustDeleted, machineNotUpToDate, machineMarkedForRemediation), + machinesNotUptoDate: collections.FromMachines(machineNotUpToDate), + EtcdMembers: []*etcd.Member{{Name: "m1"}, {Name: "m2"}, {Name: "m3"}}, + } + + got := c.StatusToLogKeyAndValues(machineJustCreated, machineJustDeleted) + + g.Expect(got).To((HaveLen(4))) + g.Expect(got[0]).To(Equal("machines")) + machines := strings.Join([]string{ + "healthy", + "just-created (just created)", + "just-deleted (just deleted)", + "marked-for-remediation (marked for remediation)", + "not-up-to-date (not up-to-date)", + "without-node (status.nodeRef not set, APIServerPod health unknown, ControllerManagerPod health unknown, SchedulerPod health unknown, EtcdPod health unknown, EtcdMember not healthy)", + }, ", ") + g.Expect(got[1]).To(Equal(machines), cmp.Diff(got[1], machines)) + g.Expect(got[2]).To(Equal("etcdMembers")) + g.Expect(got[3]).To(Equal("m1, m2, m3")) +} + type machineOpt func(*clusterv1.Machine) func failureDomain(controlPlane bool) clusterv1.FailureDomainSpec { diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index d3bfbe07794d..0dfe3603167e 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -672,11 +672,13 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con continue } - log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...). - Info("Deleting Machine (KCP deleted)") if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete))) } + // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. + // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). + log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...). + Info("Deleting Machine (KubeadmControlPlane deleted)") } if len(errs) > 0 { err := kerrors.NewAggregate(errs) diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 1dd98641eb05..f67315721541 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -311,6 +311,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C } // Surface the operation is in progress. + // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. + // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToBeRemediated)...). Info("Deleting Machine (remediating unhealthy Machine)") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index bbfe7b77bad7..6dcbfab7f1ca 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -158,6 +158,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( "Failed to delete control plane Machine %s for cluster %s control plane: %v", machineToDelete.Name, klog.KObj(controlPlane.Cluster), err) return ctrl.Result{}, err } + // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. + // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). logger.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...). Info("Deleting Machine (scale down)", "Machine", klog.KObj(machineToDelete)) diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index ef08832ab1e2..a220c98e42f5 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -427,10 +427,12 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result // else delete owned machines. for _, machine := range machineList { if machine.DeletionTimestamp.IsZero() { - log.Info("Deleting Machine (MachineSet deleted)", "Machine", klog.KObj(machine)) if err := r.Client.Delete(ctx, machine); err != nil && !apierrors.IsNotFound(err) { return ctrl.Result{}, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) } + // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. + // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). + log.Info("Deleting Machine (MachineSet deleted)", "Machine", klog.KObj(machine)) } } @@ -816,13 +818,15 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e for i, machine := range machinesToDelete { log := log.WithValues("Machine", klog.KObj(machine)) if machine.GetDeletionTimestamp().IsZero() { - log.Info(fmt.Sprintf("Deleting Machine (scale down, deleting %d of %d)", i+1, diff)) if err := r.Client.Delete(ctx, machine); err != nil { log.Error(err, "Unable to delete Machine") r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err) errs = append(errs, err) continue } + // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. + // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). + log.Info(fmt.Sprintf("Deleting Machine (scale down, deleting %d of %d)", i+1, diff)) r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name) } else { log.Info(fmt.Sprintf("Waiting for Machine to be deleted (scale down, deleting %d of %d)", i+1, diff)) @@ -1492,10 +1496,12 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( } var errs []error for _, m := range machinesToRemediate { - log.Info("Deleting Machine (remediating unhealthy Machine)", "Machine", klog.KObj(m)) if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) { errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m))) } + // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. + // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). + log.Info("Deleting Machine (remediating unhealthy Machine)", "Machine", klog.KObj(m)) } if len(errs) > 0 { return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines") From 1e55e7a0faac11829c78043c727ada7c5dc1c198 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Mon, 20 Jan 2025 12:24:17 +0100 Subject: [PATCH 6/6] More comments --- controlplane/kubeadm/internal/control_plane.go | 2 +- controlplane/kubeadm/internal/control_plane_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 43da2fedb9ad..b360e8997cac 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -390,7 +390,7 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC c.workloadCluster = nil } -// StatusToLogKeyAndValues returns following key/value pairs describing the overall status of the control plane: +// StatusToLogKeyAndValues returns the following key/value pairs describing the overall status of the control plane: // - machines is the list of KCP machines; each machine might have additional notes surfacing // - if the machine has been created in the current reconcile // - if machines node ref is not yet set diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go index 2f9b84d263c2..a456bd102fd6 100644 --- a/controlplane/kubeadm/internal/control_plane_test.go +++ b/controlplane/kubeadm/internal/control_plane_test.go @@ -309,7 +309,7 @@ func TestStatusToLogKeyAndValues(t *testing.T) { got := c.StatusToLogKeyAndValues(machineJustCreated, machineJustDeleted) - g.Expect(got).To((HaveLen(4))) + g.Expect(got).To(HaveLen(4)) g.Expect(got[0]).To(Equal("machines")) machines := strings.Join([]string{ "healthy",