diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index a83fd77bf..9ace0fcb8 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -160,6 +160,7 @@ jobs: - check-capi-controlplane-docker-tunneling-proxy - check-capi-controlplane-docker-worker - check-capi-docker-machine-change-template + - check-capi-controlplane-remediation - check-capi-remote-machine-template-update - check-capi-docker-machine-template-update - check-capi-docker-machine-template-update-recreate diff --git a/api/controlplane/v1beta1/k0s_types.go b/api/controlplane/v1beta1/k0s_types.go index e48d7bc89..77f1812c0 100644 --- a/api/controlplane/v1beta1/k0s_types.go +++ b/api/controlplane/v1beta1/k0s_types.go @@ -39,6 +39,11 @@ const ( const ( // ControlPlaneReadyCondition documents the status of the control plane ControlPlaneReadyCondition clusterv1.ConditionType = "ControlPlaneReady" + + // RemediationInProgressAnnotation is used to keep track that a remediation is in progress, + // and more specifically it tracks that the system is in between having deleted an unhealthy machine + // and recreating its replacement. + RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress" ) // +kubebuilder:object:root=true diff --git a/go.mod b/go.mod index 7b26e3816..2f93d9a68 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/k0sproject/version v0.6.0 github.com/onsi/ginkgo/v2 v2.20.2 github.com/onsi/gomega v1.34.2 + github.com/pkg/errors v0.9.1 github.com/stretchr/testify v1.9.0 gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.30.3 @@ -125,7 +126,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.18.0 // indirect github.com/prometheus/client_model v0.6.0 // indirect github.com/prometheus/common v0.45.0 // indirect diff --git a/internal/controller/controlplane/helper.go b/internal/controller/controlplane/helper.go index 823e43cdf..b47748fa8 100644 --- a/internal/controller/controlplane/helper.go +++ b/internal/controller/controlplane/helper.go @@ -52,9 +52,18 @@ func (c *K0sController) createMachine(ctx context.Context, name string, cluster } _ = ctrl.SetControllerReference(kcp, machine, c.Scheme) - return machine, c.Client.Patch(ctx, machine, client.Apply, &client.PatchOptions{ + err = c.Client.Patch(ctx, machine, client.Apply, &client.PatchOptions{ FieldManager: "k0smotron", }) + if err != nil { + return machine, err + } + + // Remove the annotation tracking that a remediation is in progress. + // A remediation is completed when the replacement machine has been created above. 
+ delete(kcp.Annotations, cpv1beta1.RemediationInProgressAnnotation) + + return machine, nil } func (c *K0sController) deleteMachine(ctx context.Context, name string, kcp *cpv1beta1.K0sControlPlane) error { diff --git a/internal/controller/controlplane/k0s_controlplane_controller.go b/internal/controller/controlplane/k0s_controlplane_controller.go index 774be676c..b4a413926 100644 --- a/internal/controller/controlplane/k0s_controlplane_controller.go +++ b/internal/controller/controlplane/k0s_controlplane_controller.go @@ -27,6 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/go-logr/logr" "github.com/google/uuid" autopilot "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2" "github.com/k0sproject/k0smotron/internal/controller/util" @@ -271,6 +272,11 @@ func (c *K0sController) reconcile(ctx context.Context, cluster *clusterv1.Cluste return fmt.Errorf("error reconciling kubeconfig secret: %w", err) } + err = c.reconcileUnhealthyMachines(ctx, cluster, kcp) + if err != nil { + return err + } + err = c.reconcileMachines(ctx, cluster, kcp) if err != nil { return err @@ -417,58 +423,66 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv if len(machineNamesToDelete) > 0 { logger.Info("Found machines to delete", "count", len(machineNamesToDelete)) - kubeClient, err := c.getKubeClient(ctx, cluster) - if err != nil { - return fmt.Errorf("error getting cluster client set for deletion: %w", err) - } // Remove the oldest machine abd wait for the machine to be deleted to avoid etcd issues - machine := machines.Filter(func(m *clusterv1.Machine) bool { + machineToDelete := machines.Filter(func(m *clusterv1.Machine) bool { return machineNamesToDelete[m.Name] }).Oldest() - logger.Info("Found oldest machine to delete", "machine", machine.Name) - if machine.Status.Phase == string(clusterv1.MachinePhaseDeleting) { - logger.Info("Machine is being deleted, waiting for it to be deleted", "machine", machine.Name) + logger.Info("Found oldest machine to delete", "machine", machineToDelete.Name) + if machineToDelete.Status.Phase == string(clusterv1.MachinePhaseDeleting) { + logger.Info("Machine is being deleted, waiting for it to be deleted", "machine", machineToDelete.Name) return fmt.Errorf("waiting for previous machine to be deleted") } - name := machine.Name - - waitCtx, cancel := context.WithTimeout(ctx, 60*time.Second) - defer cancel() - err = wait.PollUntilContextCancel(waitCtx, 10*time.Second, true, func(fctx context.Context) (bool, error) { - if err := c.markChildControlNodeToLeave(fctx, name, kubeClient); err != nil { - return false, fmt.Errorf("error marking controlnode to leave: %w", err) - } - - ok, err := c.checkMachineLeft(fctx, name, kubeClient) - if err != nil { - logger.Error(err, "Error checking machine left", "machine", name) - } - return ok, err - }) + err := c.runMachineDeletionSequence(ctx, logger, cluster, kcp, machineToDelete) if err != nil { - return fmt.Errorf("error checking machine left: %w", err) + return err } - if err := c.deleteControlNode(ctx, name, kubeClient); err != nil { - return fmt.Errorf("error deleting controlnode: %w", err) - } + logger.Info("Deleted machine", "machine", machineToDelete.Name) + } + return nil +} - if err := c.deleteBootstrapConfig(ctx, name, kcp); err != nil { - return fmt.Errorf("error deleting machine from template: %w", err) - } +func (c *K0sController) runMachineDeletionSequence(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, kcp 
*cpv1beta1.K0sControlPlane, machine *clusterv1.Machine) error { + kubeClient, err := c.getKubeClient(ctx, cluster) + if err != nil { + return fmt.Errorf("error getting cluster client set for deletion: %w", err) + } - if err := c.deleteMachineFromTemplate(ctx, name, cluster, kcp); err != nil { - return fmt.Errorf("error deleting machine from template: %w", err) + waitCtx, cancel := context.WithTimeout(ctx, 60*time.Second) + defer cancel() + err = wait.PollUntilContextCancel(waitCtx, 10*time.Second, true, func(fctx context.Context) (bool, error) { + if err := c.markChildControlNodeToLeave(fctx, machine.Name, kubeClient); err != nil { + return false, fmt.Errorf("error marking controlnode to leave: %w", err) } - if err := c.deleteMachine(ctx, name, kcp); err != nil { - return fmt.Errorf("error deleting machine from template: %w", err) + ok, err := c.checkMachineLeft(fctx, machine.Name, kubeClient) + if err != nil { + logger.Error(err, "Error checking machine left", "machine", machine.Name) } + return ok, err + }) + if err != nil { + return fmt.Errorf("error checking machine left: %w", err) + } - logger.Info("Deleted machine", "machine", name) + if err := c.deleteControlNode(ctx, machine.Name, kubeClient); err != nil { + return fmt.Errorf("error deleting controlnode: %w", err) } + + if err := c.deleteBootstrapConfig(ctx, machine.Name, kcp); err != nil { + return fmt.Errorf("error deleting machine from template: %w", err) + } + + if err := c.deleteMachineFromTemplate(ctx, machine.Name, cluster, kcp); err != nil { + return fmt.Errorf("error deleting machine from template: %w", err) + } + + if err := c.deleteMachine(ctx, machine.Name, kcp); err != nil { + return fmt.Errorf("error deleting machine from template: %w", err) + } + return nil } diff --git a/internal/controller/controlplane/k0smotron_controlplane_controller.go b/internal/controller/controlplane/k0smotron_controlplane_controller.go index 5687b67bf..ea4c16463 100644 --- a/internal/controller/controlplane/k0smotron_controlplane_controller.go +++ b/internal/controller/controlplane/k0smotron_controlplane_controller.go @@ -35,7 +35,6 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" - "sigs.k8s.io/cluster-api/util" capiutil "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/secret" @@ -121,7 +120,7 @@ func (c *K0smotronController) Reconcile(ctx context.Context, req ctrl.Request) ( } if ready { - remoteClient, err := remote.NewClusterClient(ctx, "k0smotron", c.Client, util.ObjectKey(cluster)) + remoteClient, err := remote.NewClusterClient(ctx, "k0smotron", c.Client, capiutil.ObjectKey(cluster)) if err != nil { return res, fmt.Errorf("failed to create remote client: %w", err) } @@ -298,7 +297,7 @@ func (c *K0smotronController) reconcile(ctx context.Context, cluster *clusterv1. 
func (c *K0smotronController) ensureCertificates(ctx context.Context, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0smotronControlPlane) error { certificates := secret.NewCertificatesForInitialControlPlane(&bootstrapv1.ClusterConfiguration{}) - return certificates.LookupOrGenerate(ctx, c.Client, util.ObjectKey(cluster), *metav1.NewControllerRef(kcp, cpv1beta1.GroupVersion.WithKind("K0smotronControlPlane"))) + return certificates.LookupOrGenerate(ctx, c.Client, capiutil.ObjectKey(cluster), *metav1.NewControllerRef(kcp, cpv1beta1.GroupVersion.WithKind("K0smotronControlPlane"))) } // SetupWithManager sets up the controller with the Manager. diff --git a/internal/controller/controlplane/remediation.go b/internal/controller/controlplane/remediation.go new file mode 100644 index 000000000..7f3fecb32 --- /dev/null +++ b/internal/controller/controlplane/remediation.go @@ -0,0 +1,166 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controlplane + +import ( + "context" + "fmt" + + cpv1beta1 "github.com/k0sproject/k0smotron/api/controlplane/v1beta1" + "github.com/pkg/errors" + kerrors "k8s.io/apimachinery/pkg/util/errors" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/collections" + "sigs.k8s.io/cluster-api/util/conditions" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func (c *K0sController) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0sControlPlane) (retErr error) { + log := ctrl.LoggerFrom(ctx) + + machines, err := collections.GetFilteredMachinesForCluster(ctx, c, cluster, collections.ControlPlaneMachines(cluster.Name)) + if err != nil { + return fmt.Errorf("failed to filter machines for control plane: %w", err) + } + + healthyMachines := machines.Filter(isHealthy) + + // Clean up pending remediation actions that were not completed if the underlying machine is now back to healthy. + // Machines to be sanitized have the following conditions: + // + // HealthCheckSucceeded=True (the machine's current state is healthy) + // AND + // OwnerRemediated=False (machine was marked as unhealthy previously) + err = c.sanitizeHealthyMachines(ctx, healthyMachines) + if err != nil { + return err + } + if _, ok := kcp.Annotations[cpv1beta1.RemediationInProgressAnnotation]; ok { + log.Info("Another remediation is already in progress. Skipping remediation.") + return nil + } + + // Retrieve machines marked as unhealthy by the MHC controller + unhealthyMachines := machines.Filter(collections.HasUnhealthyCondition) + + // No unhealthy machines to remediate. Reconciliation can move on to the next stage.
+ if len(unhealthyMachines) == 0 { + return nil + } + machineToBeRemediated := unhealthyMachines.Oldest() + + if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() { + log.Info("Machine to remediate is being deleted.") + return nil + } + log = log.WithValues("Machine", machineToBeRemediated) + // Always patch the conditions of the machine to be remediated in order to report the remediation state. + defer func() { + derr := c.Status().Patch(ctx, machineToBeRemediated, client.Merge) + if derr != nil { + log.Error(derr, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name) + if retErr == nil { + retErr = errors.Wrapf(derr, "failed to patch control plane Machine %s", machineToBeRemediated.Name) + } + return + } + }() + // Ensure that the cluster remains available during and after the remediation process. The remediation must not + // compromise the cluster's ability to serve workloads or cause disruption to the control plane's functionality. + if kcp.Status.Ready { + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. + if machines.Len() <= 1 { + log.Info("A control plane machine needs remediation, but the number of current replicas is less than or equal to 1. Skipping remediation", "replicas", machines.Len()) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1") + return nil + } + + // The cluster MUST NOT have healthy machines still being provisioned. This rule prevents KCP from taking actions while the cluster is in a transitional state. + if isProvisioningHealthyMachine(healthyMachines) { + log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation") + + return nil + } + + // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP from taking actions while the cluster is in a transitional state. + if len(machines.Filter(collections.HasDeletionTimestamp)) > 0 { + log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") + return nil + } + } + + // After the checks above, remediation can be carried out. + + if err := c.runMachineDeletionSequence(ctx, log, cluster, kcp, machineToBeRemediated); err != nil { + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) + } + log.Info("Remediated unhealthy machine, a replacement machine should take its place soon.") + + // Mark the control plane to track that a remediation is in progress and do not trigger another one until the machine is gone. + // This annotation is removed when the controller creates the replacement machine.
+ annotations.AddAnnotations(kcp, map[string]string{ + cpv1beta1.RemediationInProgressAnnotation: "true", + }) + + return nil +} + +func isHealthy(machine *clusterv1.Machine) bool { + if machine == nil { + return false + } + return conditions.IsTrue(machine, clusterv1.MachineHealthCheckSucceededCondition) +} + +func hasNode(machine *clusterv1.Machine) bool { + if machine == nil { + return false + } + return machine.Status.NodeRef != nil +} + +func isProvisioningHealthyMachine(healthyMachines collections.Machines) bool { + return len(healthyMachines.Filter(collections.Not(hasNode))) > 0 +} + +func (c *K0sController) sanitizeHealthyMachines(ctx context.Context, healthyMachines collections.Machines) error { + log := ctrl.LoggerFrom(ctx) + + errList := []error{} + for _, m := range healthyMachines { + if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && m.DeletionTimestamp.IsZero() { + + conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition) + + err := c.Status().Patch(ctx, m, client.Merge) + if err != nil { + log.Error(err, "Failed to patch control plane Machine to clean machine's unhealthy condition", "Machine", m.Name) + errList = append(errList, errors.Wrapf(err, "failed to patch control plane Machine %s to clean machine's unhealthy condition", m.Name)) + } + } + } + if len(errList) > 0 { + return kerrors.NewAggregate(errList) + } + + return nil +} diff --git a/inttest/Makefile.variables b/inttest/Makefile.variables index 124e707e7..60143e1d6 100644 --- a/inttest/Makefile.variables +++ b/inttest/Makefile.variables @@ -21,6 +21,7 @@ smoketests := \ check-capi-controlplane-docker-worker \ check-capi-controlplane-docker-tunneling \ check-capi-controlplane-docker-tunneling-proxy \ + check-capi-controlplane-remediation \ check-monitoring \ check-capi-docker-machinedeployment \ check-capi-docker-clusterclass \ diff --git a/inttest/capi-controlplane-remediation/capi_controlplane_remediation_test.go b/inttest/capi-controlplane-remediation/capi_controlplane_remediation_test.go new file mode 100644 index 000000000..ae96f7792 --- /dev/null +++ b/inttest/capi-controlplane-remediation/capi_controlplane_remediation_test.go @@ -0,0 +1,364 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package capicontrolplaneremediation + +import ( + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "strconv" + "strings" + "testing" + "time" + + "github.com/k0sproject/k0smotron/inttest/util" + "github.com/stretchr/testify/suite" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" +) + +type CAPIControlplaneHealthCheckRemediation struct { + suite.Suite + client *kubernetes.Clientset + restConfig *rest.Config + clusterYamlsPath string + ctx context.Context +} + +func TestCAPIControlplaneHealthCheckRemediation(t *testing.T) { + s := CAPIControlplaneHealthCheckRemediation{} + suite.Run(t, &s) +} + +func (s *CAPIControlplaneHealthCheckRemediation) SetupSuite() { + kubeConfigPath := os.Getenv("KUBECONFIG") + s.Require().NotEmpty(kubeConfigPath, "KUBECONFIG env var must be set and point to kind cluster") + // Get the rest config from kubeconfig + restCfg, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath) + s.Require().NoError(err) + s.Require().NotNil(restCfg) + s.restConfig = restCfg + + // Get kube client from kubeconfig + kubeClient, err := kubernetes.NewForConfig(restCfg) + s.Require().NoError(err) + s.Require().NotNil(kubeClient) + s.client = kubeClient + + tmpDir := s.T().TempDir() + s.clusterYamlsPath = tmpDir + "/cluster.yaml" + s.Require().NoError(os.WriteFile(s.clusterYamlsPath, []byte(testEnvironmentDeclaration), 0644)) + + s.ctx, _ = util.NewSuiteContext(s.T()) +} + +func (s *CAPIControlplaneHealthCheckRemediation) TestCAPIControlPlaneRemediation() { + + // Apply the child cluster objects + s.applyClusterObjects() + defer func() { + keep := os.Getenv("KEEP_AFTER_TEST") + if keep == "true" { + return + } + if keep == "on-failure" && s.T().Failed() { + return + } + s.T().Log("Deleting cluster objects") + s.deleteCluster() + }() + + var localPort int + err := wait.PollUntilContextCancel(s.ctx, 1*time.Second, true, func(ctx context.Context) (bool, error) { + localPort, _ = getLBPort("docker-test-cluster-lb") + return localPort > 0, nil + }) + s.Require().NoError(err) + + s.T().Log("waiting to see admin kubeconfig secret") + kmcKC, err := util.GetKMCClientSet(s.ctx, s.client, "docker-test-cluster", "default", localPort) + s.Require().NoError(err) + + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { + b, _ := kmcKC.RESTClient(). + Get(). + AbsPath("/healthz").
+ DoRaw(context.Background()) + + return string(b) == "ok", nil + }) + s.Require().NoError(err) + + s.T().Log("waiting for control-plane nodes to be ready") + for i := 0; i < 3; i++ { + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { + nodeName := fmt.Sprintf("docker-test-cluster-docker-test-%d", i) + output, err := exec.Command("docker", "exec", nodeName, "k0s", "status").Output() + if err != nil { + return false, nil + } + + return strings.Contains(string(output), "Version:"), nil + }) + s.Require().NoError(err) + } + + s.T().Log("waiting for worker-node to be ready") + s.Require().NoError(util.WaitForNodeReadyStatus(s.ctx, kmcKC, "docker-test-cluster-docker-test-worker-0", corev1.ConditionTrue)) + + s.T().Log("forcing a machine health check and waiting for recreation") + // To force the MachineHealthCheck controller to take action, we use the 'cluster.x-k8s.io/remediate-machine' annotation on the machine + s.forceMHC() + + time.Sleep(time.Minute) + s.T().Log("waiting for expected control-plane replicas without old annotation related to forcing remediation") + for i := 0; i < 3; i++ { + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { + nodeName := fmt.Sprintf("docker-test-cluster-docker-test-%d", i) + output, err := exec.Command("docker", "exec", nodeName, "k0s", "status").Output() + if err != nil { + return false, nil + } + + containsVersion := strings.Contains(string(output), "Version:") + + // Check that a new machine was generated because the previous one had the 'cluster.x-k8s.io/remediate-machine' annotation + hasForceRemediationAnnotation := false + if i == 0 { + hasForceRemediationAnnotation = s.hasForceRemediationAnnotation() + } + + return containsVersion && !hasForceRemediationAnnotation, nil + }) + s.Require().NoError(err) + } +} + +func (s *CAPIControlplaneHealthCheckRemediation) applyClusterObjects() { + // Exec via kubectl + out, err := exec.Command("kubectl", "apply", "-f", s.clusterYamlsPath).CombinedOutput() + s.Require().NoError(err, "failed to apply cluster objects: %s", string(out)) +} + +func (s *CAPIControlplaneHealthCheckRemediation) deleteCluster() { + // Exec via kubectl + out, err := exec.Command("kubectl", "delete", "-f", s.clusterYamlsPath).CombinedOutput() + s.Require().NoError(err, "failed to delete cluster objects: %s", string(out)) +} + +func (s *CAPIControlplaneHealthCheckRemediation) forceMHC() { + // Exec via kubectl + out, err := exec.Command( + "kubectl", + "patch", + "machine", + "docker-test-cluster-docker-test-0", + "--type=merge", + "-p", + `{"metadata": {"annotations": {"cluster.x-k8s.io/remediate-machine": ""}}}`, + ).CombinedOutput() + s.Require().NoError(err, "failed to patch machine to force a healthcheck: %s", string(out)) +} + +func (s *CAPIControlplaneHealthCheckRemediation) hasForceRemediationAnnotation() bool { + // Exec via kubectl + out, err := exec.Command( + "sh", + "-c", + `kubectl get machine docker-test-cluster-docker-test-0 -o json | jq '.metadata.annotations | has("cluster.x-k8s.io/remediate-machine")'`, + ).CombinedOutput() + s.Require().NoError(err, "failed to check if annotation present in machine: %s", string(out)) + // jq prints a lowercase boolean followed by a newline + return strings.TrimSpace(string(out)) == "true" +} + +func getLBPort(name string) (int, error) { + b, err := exec.Command("docker", "inspect", name, "--format", "{{json .NetworkSettings.Ports}}").Output() + if err != nil { + return 0, fmt.Errorf("failed to get inspect info from 
container %s: %w", name, err) + } + + var ports map[string][]map[string]string + err = json.Unmarshal(b, &ports) + if err != nil { + return 0, fmt.Errorf("failed to unmarshal inspect info from container %s: %w", name, err) + } + + return strconv.Atoi(ports["6443/tcp"][0]["HostPort"]) +} + +var testEnvironmentDeclaration = ` +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + name: docker-test-cluster + namespace: default +spec: + clusterNetwork: + pods: + cidrBlocks: + - 192.168.0.0/16 + serviceDomain: cluster.local + services: + cidrBlocks: + - 10.128.0.0/12 + controlPlaneRef: + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + kind: K0sControlPlane + name: docker-test-cluster-docker-test + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: DockerCluster + name: docker-test +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerMachineTemplate +metadata: + name: docker-test-cp-template + namespace: default +spec: + template: + spec: + customImage: kindest/node:v1.31.0 +--- +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +kind: K0sControlPlane +metadata: + name: docker-test-cluster-docker-test +spec: + replicas: 3 + version: v1.31.2+k0s.0 + k0sConfigSpec: + args: + - --enable-worker + k0s: + apiVersion: k0s.k0sproject.io/v1beta1 + kind: ClusterConfig + metadata: + name: k0s + spec: + api: + extraArgs: + anonymous-auth: "true" + telemetry: + enabled: false + network: + controlPlaneLoadBalancing: + enabled: false + files: + - path: /tmp/test-file-secret + contentFrom: + secretRef: + name: test-file-secret + key: value + machineTemplate: + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: DockerMachineTemplate + name: docker-test-cp-template + namespace: default +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerCluster +metadata: + name: docker-test + namespace: default +spec: +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Machine +metadata: + name: docker-test-cluster-docker-test-worker-0 + namespace: default +spec: + version: v1.31.2 + clusterName: docker-test-cluster + bootstrap: + configRef: + apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 + kind: K0sWorkerConfig + name: docker-test-worker-0 + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: DockerMachine + name: docker-test-cluster-docker-test-worker-0 +--- +apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 +kind: K0sWorkerConfig +metadata: + name: docker-test-worker-0 + namespace: default +spec: + # version is deliberately different to be able to verify we actually pick it up :) + version: v1.31.2+k0s.0 + args: + - --labels=k0sproject.io/foo=bar + preStartCommands: + - echo -n "pre-start" > /tmp/pre-start + postStartCommands: + - echo -n "post-start" > /tmp/post-start + files: + - path: /tmp/test-file + content: test-file + - path: /tmp/test-file-secret + contentFrom: + secretRef: + name: test-file-secret + key: value +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: DockerMachine +metadata: + name: docker-test-cluster-docker-test-worker-0 + namespace: default +spec: + customImage: kindest/node:v1.31.0 +--- +apiVersion: v1 +kind: Secret +metadata: + name: test-file-secret + namespace: default +type: Opaque +data: + value: dGVzdA== +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineHealthCheck +metadata: + name: docker-test-cluster-kcp-unhealthy-5m +spec: + clusterName: docker-test-cluster + maxUnhealthy: 100% + selector: + matchLabels: + cluster.x-k8s.io/control-plane: 
"true" + unhealthyConditions: + - type: Ready + status: Unknown + timeout: 500s + - type: Ready + status: "False" + timeout: 500s +`