Skip to content

Commit

Permalink
Add k/v pairs describing the overall status of the control plane
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini authored and k8s-infra-cherrypick-robot committed Jan 20, 2025
1 parent 726572c commit 5d60723
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 18 deletions.
80 changes: 80 additions & 0 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ package internal

import (
"context"
"fmt"
"sort"
"strings"

"github.com/pkg/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -32,6 +35,7 @@ import (
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/failuredomains"
"sigs.k8s.io/cluster-api/util/patch"
)
Expand Down Expand Up @@ -385,3 +389,79 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC
c.managementCluster = managementCluster
c.workloadCluster = nil
}

// StatusToLogKeyAndValues returns the following key/value pairs describing the overall status of the control plane:
//   - machines is the list of KCP machines; each machine might have additional notes surfacing
//   - if the machine has been created in the current reconcile (new)
//   - if the machine's node is not yet reported (node ref not set)
//   - if the machine has been marked for remediation (health check failed)
//   - if there are unhealthy control plane components on the machine
//   - if the machine has a deletion timestamp/has been deleted in the current reconcile (deleting)
//   - if the machine is not up to date with the KCP spec (not up to date)
//
// - etcdMembers list as reported by etcd.
func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clusterv1.Machine) []any {
	// Conditions surfacing the health of control plane components hosted on each machine.
	// Etcd-related conditions only apply when KCP manages etcd.
	controlPlaneMachineHealthConditions := []clusterv1.ConditionType{
		controlplanev1.MachineAPIServerPodHealthyCondition,
		controlplanev1.MachineControllerManagerPodHealthyCondition,
		controlplanev1.MachineSchedulerPodHealthyCondition,
	}
	if c.IsEtcdManaged() {
		controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions,
			controlplanev1.MachineEtcdPodHealthyCondition,
			controlplanev1.MachineEtcdMemberHealthyCondition,
		)
	}

	// +1 leaves room for the optional newMachine entry appended below.
	machines := make([]string, 0, len(c.Machines)+1)
	for _, m := range c.Machines {
		notes := []string{}

		if m.Status.NodeRef == nil {
			notes = append(notes, "node ref not set")
		}

		if c.MachinesToBeRemediatedByKCP().Has(m) {
			notes = append(notes, "health check failed")
		}

		for _, condition := range controlPlaneMachineHealthConditions {
			if conditions.IsUnknown(m, condition) {
				// e.g. "APIServerPodHealthy" -> "APIServerPod health unknown".
				notes = append(notes, strings.ReplaceAll(string(condition), "Healthy", " health unknown"))
			}
			if conditions.IsFalse(m, condition) {
				// e.g. "APIServerPodHealthy" -> "APIServerPod not healthy".
				notes = append(notes, strings.ReplaceAll(string(condition), "Healthy", " not healthy"))
			}
		}

		if !c.UpToDateMachines().Has(m) {
			notes = append(notes, "not up to date")
		}

		if !m.DeletionTimestamp.IsZero() || (deletedMachine != nil && m.Name == deletedMachine.Name) {
			notes = append(notes, "deleting")
		}

		name := m.Name
		if len(notes) > 0 {
			name = fmt.Sprintf("%s (%s)", name, strings.Join(notes, ", "))
		}
		machines = append(machines, name)
	}

	// The machine created in the current reconcile is not yet part of c.Machines.
	if newMachine != nil {
		machines = append(machines, fmt.Sprintf("%s (new)", newMachine.Name))
	}
	sort.Strings(machines)

	etcdMembers := make([]string, 0, len(c.EtcdMembers))
	for _, m := range c.EtcdMembers {
		etcdMembers = append(etcdMembers, m.Name)
	}
	sort.Strings(etcdMembers)

	return []any{
		"machines", strings.Join(machines, ", "),
		"etcdMembers", strings.Join(etcdMembers, ", "),
	}
}
3 changes: 2 additions & 1 deletion controlplane/kubeadm/internal/controllers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con
continue
}

log.Info("Deleting Machine (KCP deleted)")
log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...).
Info("Deleting Machine (KCP deleted)")
if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete)))
}
Expand Down
17 changes: 5 additions & 12 deletions controlplane/kubeadm/internal/controllers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
"k8s.io/apimachinery/pkg/types"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand Down Expand Up @@ -183,14 +182,13 @@ func (r *KubeadmControlPlaneReconciler) reconcileExternalReference(ctx context.C
return patchHelper.Patch(ctx, obj)
}

func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec, failureDomain *string) error {
func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec, failureDomain *string) (*clusterv1.Machine, error) {
var errs []error
log := ctrl.LoggerFrom(ctx)

// Compute desired Machine
machine, err := r.computeDesiredMachine(kcp, cluster, failureDomain, nil)
if err != nil {
return errors.Wrap(err, "failed to create Machine: failed to compute desired Machine")
return nil, errors.Wrap(err, "failed to create Machine: failed to compute desired Machine")
}

// Since the cloned resource should eventually have a controller ref for the Machine, we create an
Expand Down Expand Up @@ -222,7 +220,7 @@ func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx conte
// Safe to return early here since no resources have been created yet.
conditions.MarkFalse(kcp, controlplanev1.MachinesCreatedCondition, controlplanev1.InfrastructureTemplateCloningFailedReason,
clusterv1.ConditionSeverityError, err.Error())
return errors.Wrap(err, "failed to clone infrastructure template")
return nil, errors.Wrap(err, "failed to clone infrastructure template")
}
machine.Spec.InfrastructureRef = *infraRef

Expand Down Expand Up @@ -250,15 +248,10 @@ func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx conte
if err := r.cleanupFromGeneration(ctx, infraRef, bootstrapRef); err != nil {
errs = append(errs, errors.Wrap(err, "failed to cleanup generated resources"))
}

return kerrors.NewAggregate(errs)
return nil, kerrors.NewAggregate(errs)
}

log.Info("Machine created (scale up)",
"Machine", klog.KObj(machine),
infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name),
bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
return nil
return machine, nil
}

func (r *KubeadmControlPlaneReconciler) cleanupFromGeneration(ctx context.Context, remoteRefs ...*corev1.ObjectReference) error {
Expand Down
3 changes: 2 additions & 1 deletion controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
}

// Surface the operation is in progress.
log.Info("Deleting Machine (remediating unhealthy Machine)")
log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToBeRemediated)...).
Info("Deleting Machine (remediating unhealthy Machine)")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

v1beta2conditions.Set(machineToBeRemediated, metav1.Condition{
Expand Down
22 changes: 18 additions & 4 deletions controlplane/kubeadm/internal/controllers/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,19 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
return ctrl.Result{}, err
}

if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil {
newMachine, err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd)
if err != nil {
logger.Error(err, "Failed to create initial control plane Machine")
r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedInitialization", "Failed to create initial control plane Machine for cluster %s control plane: %v", klog.KObj(controlPlane.Cluster), err)
return ctrl.Result{}, err
}

logger.WithValues(controlPlane.StatusToLogKeyAndValues(newMachine, nil)...).
Info("Machine created (scale up)",
"Machine", klog.KObj(newMachine),
newMachine.Spec.InfrastructureRef.Kind, klog.KRef(newMachine.Spec.InfrastructureRef.Namespace, newMachine.Spec.InfrastructureRef.Name),
newMachine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(newMachine.Spec.Bootstrap.ConfigRef.Namespace, newMachine.Spec.Bootstrap.ConfigRef.Name))

// Requeue the control plane, in case there are additional operations to perform
return ctrl.Result{Requeue: true}, nil
}
Expand Down Expand Up @@ -87,12 +94,19 @@ func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context,
return ctrl.Result{}, err
}

if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil {
newMachine, err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd)
if err != nil {
logger.Error(err, "Failed to create additional control plane Machine")
r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleUp", "Failed to create additional control plane Machine for cluster % control plane: %v", klog.KObj(controlPlane.Cluster), err)
return ctrl.Result{}, err
}

logger.WithValues(controlPlane.StatusToLogKeyAndValues(newMachine, nil)...).
Info("Machine created (scale up)",
"Machine", klog.KObj(newMachine),
newMachine.Spec.InfrastructureRef.Kind, klog.KRef(newMachine.Spec.InfrastructureRef.Namespace, newMachine.Spec.InfrastructureRef.Name),
newMachine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(newMachine.Spec.Bootstrap.ConfigRef.Namespace, newMachine.Spec.Bootstrap.ConfigRef.Name))

// Requeue the control plane, in case there are other operations to perform
return ctrl.Result{Requeue: true}, nil
}
Expand Down Expand Up @@ -138,14 +152,14 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
// NOTE: etcd member removal will be performed by the kcp-cleanup hook after machine completes drain & all volumes are detached.
}

logger = logger.WithValues("Machine", klog.KObj(machineToDelete))
logger.Info("Deleting Machine (scale down)")
if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to delete control plane machine")
r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleDown",
"Failed to delete control plane Machine %s for cluster %s control plane: %v", machineToDelete.Name, klog.KObj(controlPlane.Cluster), err)
return ctrl.Result{}, err
}
logger.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...).
Info("Deleting Machine (scale down)", "Machine", klog.KObj(machineToDelete))

// Requeue the control plane, in case there are additional operations to perform
return ctrl.Result{Requeue: true}, nil
Expand Down
10 changes: 10 additions & 0 deletions util/collections/machine_collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,16 @@ func ToMachineList(machines Machines) clusterv1.MachineList {
return ml
}

// Has returns true when the collection contains a machine with the same
// name and namespace as the given machine.
func (s Machines) Has(machine *clusterv1.Machine) bool {
	for _, m := range s {
		if m.Name == machine.Name && m.Namespace == machine.Namespace {
			return true
		}
	}
	return false
}

// Insert adds items to the set.
func (s Machines) Insert(machines ...*clusterv1.Machine) {
for i := range machines {
Expand Down

0 comments on commit 5d60723

Please sign in to comment.