diff --git a/test/extended/two_node/tnf_kubelet_disruption.go b/test/extended/two_node/tnf_kubelet_disruption.go index 169e5ea6e3fb..ee96bfa79620 100644 --- a/test/extended/two_node/tnf_kubelet_disruption.go +++ b/test/extended/two_node/tnf_kubelet_disruption.go @@ -3,6 +3,7 @@ package two_node import ( "context" "fmt" + "regexp" "time" g "github.com/onsi/ginkgo/v2" @@ -10,6 +11,7 @@ import ( v1 "github.com/openshift/api/config/v1" "github.com/openshift/origin/test/extended/etcd/helpers" "github.com/openshift/origin/test/extended/two_node/utils" + "github.com/openshift/origin/test/extended/two_node/utils/services" exutil "github.com/openshift/origin/test/extended/util" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" nodeutil "k8s.io/kubernetes/pkg/util/node" @@ -17,9 +19,11 @@ import ( ) const ( - kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios - kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore - kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop + kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios + kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore + kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop + etcdStableDuringDisruption = 5 * time.Minute // Duration to assert etcd member stays healthy during disruption + failureWindowClockSkewBuffer = 1 * time.Minute // Buffer for clock skew when checking resource failure history ) var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() { @@ -80,12 +84,13 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual nodeList, _ := utils.GetNodes(oc, utils.AllNodes) cleanupNode := nodeList.Items[1] // Use second node for cleanup commands - g.By(fmt.Sprintf("Cleanup: Clearing any kubelet resource bans using node %s", cleanupNode.Name)) - cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone") - if cleanupErr != nil { - framework.Logf("Warning: Failed to clear kubelet-clone resource: %v (expected if no bans were active)", cleanupErr) - } else { - framework.Logf("Successfully cleared kubelet-clone resource bans and failures") + g.By(fmt.Sprintf("Cleanup: Clearing any kubelet and etcd resource bans using node %s", cleanupNode.Name)) + for _, resource := range []string{"kubelet-clone", "etcd-clone"} { + if cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, resource); cleanupErr != nil { + framework.Logf("Warning: Failed to clear %s: %v (expected if no bans were active)", resource, cleanupErr) + } else { + framework.Logf("Successfully cleared %s resource bans and failures", resource) + } } g.By("Cleanup: Validating etcd cluster health") @@ -136,15 +141,24 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual return !nodeutil.IsNodeReady(nodeObj) }, kubeletDisruptionTimeout, utils.FiveSecondPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s is not in state Ready after kubelet resource ban is applied", targetNode.Name)) + g.By("Verifying PacemakerHealthCheckDegraded condition reports kubelet failure on target node") + err = services.WaitForPacemakerHealthCheckDegraded(oc, "Kubelet", healthCheckDegradedTimeout, utils.FiveSecondPollInterval) + o.Expect(err).NotTo(o.HaveOccurred(), "Pacemaker health check should report degraded due to kubelet constraint") + 
// Assert degraded resource is Kubelet and that it is the node we banned (operator message format: " node is unhealthy: Kubelet ...") + o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{"Kubelet", targetNode.Name})).To(o.Succeed()) + g.By("Validating etcd cluster remains healthy with surviving node") o.Consistently(func() error { return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name) - }, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name)) + }, etcdStableDuringDisruption, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name)) g.By("Clearing kubelet resource bans to allow normal operation") err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone") o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors") + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + g.By("Validating both nodes are Ready") for _, node := range nodes { o.Eventually(func() bool { @@ -211,7 +225,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual g.By("Verifying Pacemaker recorded the kubelet failure in operation history") // Use a time window from when we stopped kubelet to now - failureWindow := time.Since(stopTime) + time.Minute // Add buffer for clock skew + failureWindow := time.Since(stopTime) + failureWindowClockSkewBuffer hasFailure, failures, err := utils.HasRecentResourceFailure(oc, survivingNode.Name, "kubelet-clone", failureWindow) o.Expect(err).To(o.BeNil(), "Expected to check resource failure history without errors") o.Expect(hasFailure).To(o.BeTrue(), "Pacemaker should have recorded kubelet failure in operation history") @@ -238,5 +252,61 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual return utils.ValidateEssentialOperatorsAvailable(oc) }, kubeletRestoreTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available") }) +}) + +// Etcd constraint / health check test lives in a separate Describe without [OCPFeatureGate:DualReplica]; +// we do not add new tests under the FeatureGate-gated suite. 
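+// The new Describe still runs only on DualReplica clusters: its BeforeEach skips via SkipIfNotTopology.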
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node etcd constraint and health check", func() { + defer g.GinkgoRecover() + + var ( + oc = exutil.NewCLIWithoutNamespace("two-node-etcd-constraint").AsAdmin() + etcdClientFactory *helpers.EtcdClientFactoryImpl + ) + + g.BeforeEach(func() { + utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) + etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) + utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory) + }) + g.It("should recover from etcd resource location constraint with health check degraded then healthy", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster") + nodes := nodeList.Items + targetNode := nodes[0] + survivingNode := nodes[1] + + g.By("Ensuring both nodes are healthy before applying etcd constraint") + for _, node := range nodes { + o.Expect(nodeutil.IsNodeReady(&node)).To(o.BeTrue(), fmt.Sprintf("Node %s should be ready", node.Name)) + } + + g.By(fmt.Sprintf("Banning etcd resource from node %s (location constraint)", targetNode.Name)) + err = utils.AddConstraint(oc, survivingNode.Name, "etcd-clone", targetNode.Name) + o.Expect(err).To(o.BeNil(), "Expected to ban etcd-clone from target node") + g.DeferCleanup(func() { + _ = utils.RemoveConstraint(oc, survivingNode.Name, "etcd-clone") + }) + + g.By("Verifying PacemakerHealthCheckDegraded condition reports etcd failure on target node") + // Operator message format: " node is unhealthy: Etcd has failed" (or "is stopped", etc.) + degradedPattern := regexp.QuoteMeta(targetNode.Name) + ` node is unhealthy: Etcd .*` + err = services.WaitForPacemakerHealthCheckDegraded(oc, degradedPattern, healthCheckDegradedTimeout, utils.FiveSecondPollInterval) + o.Expect(err).NotTo(o.HaveOccurred(), "Pacemaker health check should report degraded due to etcd constraint") + o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{"Etcd", targetNode.Name})).To(o.Succeed()) + + g.By("Removing etcd-clone constraint to restore normal operation") + err = utils.RemoveConstraint(oc, survivingNode.Name, "etcd-clone") + o.Expect(err).To(o.BeNil(), "Expected to clear etcd-clone constraint") + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + + g.By("Validating etcd cluster is healthy") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after etcd constraint removal", etcdClientFactory) + }, kubeletRestoreTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred()) + }) }) diff --git a/test/extended/two_node/tnf_node_replacement.go b/test/extended/two_node/tnf_node_replacement.go index d7416c01d262..dce5573bc80e 100644 --- a/test/extended/two_node/tnf_node_replacement.go +++ b/test/extended/two_node/tnf_node_replacement.go @@ -228,6 +228,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual g.By("Destroying the target VM") destroyVM(&testConfig) + g.By("Verifying that a fencing event was recorded for the target node") + o.Expect(services.WaitForFencingEvent(oc, []string{testConfig.TargetNode.Name}, healthCheckDegradedTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) + // Wait for 
etcd to stop on the surviving node g.By("Waiting for etcd to stop on the surviving node") waitForEtcdToStop(&testConfig) @@ -256,6 +259,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual g.By("Verifying the cluster is fully restored") verifyRestoredCluster(&testConfig, oc) + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) + g.By("Successfully completed node replacement process") e2e.Logf("Node replacement process completed. Backup files created in: %s", backupDir) }) diff --git a/test/extended/two_node/tnf_pacemaker_healthcheck.go b/test/extended/two_node/tnf_pacemaker_healthcheck.go new file mode 100644 index 000000000000..e95830ed45a4 --- /dev/null +++ b/test/extended/two_node/tnf_pacemaker_healthcheck.go @@ -0,0 +1,200 @@ +package two_node + +import ( + "context" + "fmt" + "math/rand" + "strings" + "sync" + "time" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + v1 "github.com/openshift/api/config/v1" + "github.com/openshift/origin/test/extended/etcd/helpers" + "github.com/openshift/origin/test/extended/two_node/utils" + "github.com/openshift/origin/test/extended/two_node/utils/services" + exutil "github.com/openshift/origin/test/extended/util" + corev1 "k8s.io/api/core/v1" + e2e "k8s.io/kubernetes/test/e2e/framework" +) + +const ( + // healthCheckUpdatedTimeout is the time to wait for the Pacemaker health check condition to update (degraded or healthy). + healthCheckUpdatedTimeout = 2 * time.Minute + healthCheckDegradedTimeout = healthCheckUpdatedTimeout + healthCheckHealthyTimeout = healthCheckUpdatedTimeout + // Longer timeouts for tests that trigger a fencing event (ungraceful shutdown, cold-boot, network disruption): + // API server can be slow to recover, so we wait up to 5 minutes before asserting PacemakerHealthCheckDegraded/Healthy. + healthCheckDegradedTimeoutAfterFencing = 5 * time.Minute + healthCheckHealthyTimeoutAfterFencing = 5 * time.Minute + // StatusUnknownDegradedThreshold and StatusStalenessThreshold in CEO are 5 minutes; we must block for at least this long before asserting degraded. + staleMinBlockDuration = 5 * time.Minute + // After blocking, allow time for healthcheck controller (30s resync) to observe degraded. + staleCRDegradedTimeout = 2 * time.Minute + staleTimestampDegradedTimeout = 2 * time.Minute + // Interval for background delete loops: delete as soon as resources appear (match aggressive manual watch cadence). 
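+	// A 2s cadence is far shorter than the health-check controller's 30s resync, so the deleted resource stays absent for most of that window.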
+ staleTestDeleteInterval = 2 * time.Second + pacemakerClusterCRName = "cluster" + statusCollectorLabel = "app.kubernetes.io/name=pacemaker-status-collector" + etcdNamespaceFencing = "openshift-etcd" +) + +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Disruptive] Pacemaker health check disruptive scenarios", func() { + defer g.GinkgoRecover() + + var ( + oc = exutil.NewCLIWithoutNamespace("tnf-pacemaker-healthcheck").AsAdmin() + etcdClientFactory *helpers.EtcdClientFactoryImpl + peerNode corev1.Node + targetNode corev1.Node + ) + + g.BeforeEach(func() { + utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) + etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) + utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory) + + nodes, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + randomIndex := rand.Intn(len(nodes.Items)) + peerNode = nodes.Items[randomIndex] + targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)] + }) + + g.It("should report degraded when a node is in standby then healthy after unstandby", func() { + g.By(fmt.Sprintf("Putting %s in standby from %s", targetNode.Name, peerNode.Name)) + o.Expect(utils.PcsNodeStandby(oc, peerNode.Name, targetNode.Name)).To(o.Succeed()) + + g.By("Verifying PacemakerHealthCheckDegraded condition reports target node in standby") + o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "standby", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "standby"})).To(o.Succeed()) + + g.By(fmt.Sprintf("Bringing %s out of standby", targetNode.Name)) + o.Expect(utils.PcsNodeUnstandby(oc, peerNode.Name, targetNode.Name)).To(o.Succeed()) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + }) + + g.It("should report degraded when cluster is in maintenance mode then healthy after clearing", func() { + g.By("Setting cluster maintenance mode") + o.Expect(utils.PcsPropertySetMaintenanceMode(oc, peerNode.Name, true)).To(o.Succeed()) + + g.By("Verifying PacemakerHealthCheckDegraded condition reports maintenance") + o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "maintenance", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + + g.By("Clearing cluster maintenance mode") + o.Expect(utils.PcsPropertySetMaintenanceMode(oc, peerNode.Name, false)).To(o.Succeed()) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + }) + + g.It("should report degraded when a node is in maintenance mode then healthy after unmaintenance", func() { + g.By(fmt.Sprintf("Putting %s in node maintenance from %s", targetNode.Name, peerNode.Name)) + o.Expect(utils.PcsNodeMaintenance(oc, peerNode.Name, targetNode.Name)).To(o.Succeed()) + + g.By("Verifying PacemakerHealthCheckDegraded condition reports target node in maintenance") + o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "maintenance", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "maintenance"})).To(o.Succeed()) + + 
g.By(fmt.Sprintf("Bringing %s out of node maintenance", targetNode.Name)) + o.Expect(utils.PcsNodeUnmaintenance(oc, peerNode.Name, targetNode.Name)).To(o.Succeed()) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + }) + +}) + +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial] Pacemaker health check stale status scenarios", func() { + defer g.GinkgoRecover() + + var ( + oc = exutil.NewCLIWithoutNamespace("tnf-pacemaker-stale").AsAdmin() + etcdClientFactory *helpers.EtcdClientFactoryImpl + ) + + g.BeforeEach(func() { + utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) + etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) + utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory) + }) + + g.It("should report degraded when PacemakerCluster CR is repeatedly deleted then healthy after CR is allowed to exist", func() { + ctx, cancel := context.WithCancel(context.Background()) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + ticker := time.NewTicker(staleTestDeleteInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + out, err := oc.AsAdmin().Run("delete").Args("pacemakercluster", pacemakerClusterCRName, "--ignore-not-found").Output() + if err != nil { + e2e.Logf("Staleness CR delete loop: delete pacemakercluster/%s failed: %v (output: %q)", pacemakerClusterCRName, err, string(out)) + } else if strings.TrimSpace(string(out)) != "" { + e2e.Logf("Staleness CR delete loop: %s", string(out)) + } + } + } + }() + + g.By("Deleting PacemakerCluster CR for 5 minutes so operator exceeds StatusUnknownDegradedThreshold") + time.Sleep(staleMinBlockDuration) + + g.By("Waiting for PacemakerHealthCheckDegraded (CR not found)") + o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "not found", staleCRDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + + // Only stop the delete loop after asserting degraded; otherwise the operator could recreate the CR before we observe not found. 
+ g.By("Stopping CR delete loop and allowing operator to recreate CR") + cancel() + wg.Wait() + + g.By("Verifying PacemakerHealthCheckDegraded condition clears") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + }) + + g.It("should report degraded when status collector jobs are repeatedly deleted then healthy after jobs can run", func() { + ctx, cancel := context.WithCancel(context.Background()) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + ticker := time.NewTicker(staleTestDeleteInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + out, err := oc.AsAdmin().Run("delete").Args("jobs", "-n", etcdNamespaceFencing, "-l", statusCollectorLabel, "--ignore-not-found").Output() + if err != nil { + e2e.Logf("Staleness job delete loop: delete jobs -l %s -n %s failed: %v (output: %q)", statusCollectorLabel, etcdNamespaceFencing, err, string(out)) + } else if strings.TrimSpace(string(out)) != "" { + e2e.Logf("Staleness job delete loop: %s", string(out)) + } + } + } + }() + + g.By("Blocking status collector for 5 minutes so CR lastUpdated exceeds StatusStalenessThreshold") + time.Sleep(staleMinBlockDuration) + + g.By("Waiting for PacemakerHealthCheckDegraded (stale status)") + o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "stale", staleTimestampDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + + // Only stop the delete loop after asserting degraded; otherwise a job could complete and update the CR before we observe stale. + g.By("Stopping job delete loop and allowing cronjob to run") + cancel() + wg.Wait() + + g.By("Verifying PacemakerHealthCheckDegraded condition clears") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) + }) +}) diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go index 8f5635edcaa2..0b96e04a2444 100644 --- a/test/extended/two_node/tnf_recovery.go +++ b/test/extended/two_node/tnf_recovery.go @@ -5,6 +5,8 @@ import ( "fmt" "math/rand" "os" + "path/filepath" + "strings" "time" g "github.com/onsi/ginkgo/v2" @@ -17,6 +19,7 @@ import ( "github.com/openshift/origin/test/extended/two_node/utils/services" exutil "github.com/openshift/origin/test/extended/util" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/kubernetes/test/e2e/framework" ) @@ -27,12 +30,16 @@ const ( memberIsLeaderTimeout = 20 * time.Minute memberRejoinedLearnerTimeout = 10 * time.Minute memberPromotedVotingTimeout = 15 * time.Minute - networkDisruptionDuration = 15 * time.Second + networkDisruptionDuration = 15 * time.Second vmRestartTimeout = 5 * time.Minute vmUngracefulShutdownTimeout = 30 * time.Second // Ungraceful VM shutdown is typically fast vmGracefulShutdownTimeout = 10 * time.Minute // Graceful VM shutdown is typically slow membersHealthyAfterDoubleReboot = 30 * time.Minute // Includes full VM reboot and etcd member healthy progressLogInterval = time.Minute // Target interval for progress logging + + fencingJobName = "tnf-fencing-job" + fencingJobNamespace = "openshift-etcd" + fencingJobWaitTimeout = 10 * time.Minute ) // computeLogInterval calculates poll attempts between progress logs based on poll interval. 
@@ -91,6 +98,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors") time.Sleep(time.Minute) + // Graceful shutdown does not fence the node; skip fencing-event assertion and proceed to member-list validation. g.By(fmt.Sprintf("Ensuring %s leaves the member list (timeout: %v)", targetNode.Name, memberHasLeftTimeout)) o.Eventually(func() error { return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, targetNode.Name) @@ -113,6 +121,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual &survivedNode, &targetNode, true, false, // targetNode expected started == true, learner == false memberPromotedVotingTimeout, utils.FiveSecondPollInterval) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) }) g.It("should recover from ungraceful node shutdown with etcd member re-addition", func() { @@ -126,6 +137,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.Expect(err).To(o.BeNil(), "Expected to ungracefully shutdown the node without errors", targetNode.Name, err) time.Sleep(1 * time.Minute) + g.By("Verifying that a fencing event was recorded for the node") + o.Expect(services.WaitForFencingEvent(oc, []string{targetNode.Name}, healthCheckDegradedTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) + g.By(fmt.Sprintf("Ensuring that %s added %s back as learner (timeout: %v)", peerNode.Name, targetNode.Name, memberIsLeaderTimeout)) validateEtcdRecoveryState(oc, etcdClientFactory, &survivedNode, @@ -143,6 +157,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual &survivedNode, &targetNode, true, false, // targetNode expected started == true, learner == false memberPromotedVotingTimeout, utils.FiveSecondPollInterval) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) }) g.It("should recover from network disruption with etcd member re-addition", func() { @@ -155,6 +172,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.Expect(err).To(o.BeNil(), "Expected to disrupt network without errors") g.GinkgoT().Printf("command: '%s'\n", command) + g.By("Verifying that a fencing event was recorded for one of the nodes") + o.Expect(services.WaitForFencingEvent(oc, []string{targetNode.Name, peerNode.Name}, healthCheckDegradedTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) + g.By(fmt.Sprintf("Ensuring cluster recovery with proper leader/learner roles after network disruption (timeout: %v)", memberIsLeaderTimeout)) // Note: The fenced node may recover quickly and already be started when we get // the first etcd membership. 
This is valid behavior, so we capture the learner's @@ -177,6 +197,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual leaderNode, learnerNode, true, false, // targetNode expected started == true, learner == false memberPromotedVotingTimeout, utils.FiveSecondPollInterval) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) }) g.It("should recover from a double node failure (cold-boot) [Requires:HypervisorSSHConfig]", func() { @@ -207,6 +230,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to reach shut off state in %s timeout", d.vm, d.node, vmUngracefulShutdownTimeout)) } + // Both nodes failed at once; they may have the same etcd revision and restart together without fencing. g.By("Restarting both nodes") restartVms(dataPair, c) @@ -216,6 +240,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual &nodeA, &nodeB, true, false, membersHealthyAfterDoubleReboot, utils.FiveSecondPollInterval) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) }) g.It("should recover from double graceful node shutdown (cold-boot) [Requires:HypervisorSSHConfig]", func() { @@ -255,6 +282,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual &nodeA, &nodeB, true, false, membersHealthyAfterDoubleReboot, utils.FiveSecondPollInterval) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) }) g.It("should recover from sequential graceful node shutdowns (cold-boot) [Requires:HypervisorSSHConfig]", func() { @@ -327,6 +357,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual err = vmShutdownAndWait(VMShutdownModeUngraceful, vmSecondToShutdown, c) o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown)) + // No survivor to fence the last node; first node left gracefully and will rejoin as learner. 
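+		// As in the other cold-boot cases, skip the fencing-event assertion and only verify that the degraded condition clears after recovery.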
g.By("Restarting both nodes") restartVms(dataPair, c) @@ -336,15 +367,91 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual &firstToShutdown, &secondToShutdown, true, false, membersHealthyAfterDoubleReboot, utils.FiveSecondPollInterval) + + g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) }) g.It("should recover from BMC credential rotation with fencing", func() { - bmcNode := targetNode - survivedNode := peerNode + secretName := "fencing-credentials-" + targetNode.Name + backupDir, err := os.MkdirTemp("", "tnf-fencing-backup-") + o.Expect(err).To(o.BeNil(), "Expected to create backup directory") + defer func() { + _ = os.RemoveAll(backupDir) + }() + + // Backup the fencing secret (copy from OpenShift to disk) so we can restore it after + // corrupting Pacemaker; delete+recreate with the same content triggers the fencing job + // to push the correct credentials back into Pacemaker. + g.By("Backing up fencing secret") + o.Expect(core.BackupResource(oc, "secret", secretName, fencingJobNamespace, backupDir)).To(o.Succeed()) + + g.By("Updating pacemaker stonith with wrong password to simulate fencing degraded") + stonithID := targetNode.Name + "_redfish" + o.Expect(utils.PcsStonithUpdatePassword(oc, peerNode.Name, stonithID, "wrongpassword")).To(o.Succeed()) + + // Ensure we always restore the secret on exit so the cluster is not left with wrong Pacemaker credentials. + backupFile := filepath.Join(backupDir, secretName+".yaml") + defer func() { + _, _ = oc.AsAdmin().Run("delete").Args("secret", secretName, "-n", fencingJobNamespace, "--ignore-not-found").Output() + if err := core.RestoreResource(oc, backupFile); err != nil { + framework.Logf("Warning: deferred restore of fencing secret failed: %v", err) + } + }() + + g.By("Verifying PacemakerHealthCheckDegraded condition reports fencing unavailable for target node") + o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, `fencing unavailable \(no agents running\)`, healthCheckDegradedTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) + o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "fencing unavailable"})).To(o.Succeed()) + + g.By("Restoring fencing secret (delete and recreate to trigger fencing job)") + _, _ = oc.AsAdmin().Run("delete").Args("secret", secretName, "-n", fencingJobNamespace, "--ignore-not-found").Output() + o.Expect(core.RestoreResource(oc, backupFile)).To(o.Succeed()) + g.By("Waiting for fencing job to complete so Pacemaker gets the correct credentials") kubeClient := oc.AdminKubeClient() + o.Eventually(func() error { + job, err := kubeClient.BatchV1().Jobs(fencingJobNamespace).Get(context.Background(), fencingJobName, metav1.GetOptions{}) + if err != nil { + return err + } + if job.Status.CompletionTime == nil { + return fmt.Errorf("job %s not yet complete", fencingJobName) + } + return nil + }, fencingJobWaitTimeout, utils.FiveSecondPollInterval).Should(o.Succeed(), "fencing job must complete before asserting healthy") - ns, secretName, originalPassword, err := apis.RotateNodeBMCPassword(kubeClient, &bmcNode) + g.By("Verifying PacemakerHealthCheckDegraded condition clears") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed()) + }) +}) + +// Fencing secret rotation test lives 
in a separate Describe without [OCPFeatureGate:DualReplica]; +// we do not add new tests under the FeatureGate-gated suite. +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Disruptive] Two Node fencing secret rotation", func() { + defer g.GinkgoRecover() + + var ( + oc = exutil.NewCLIWithoutNamespace("tnf-fencing-secret-rotation").AsAdmin() + etcdClientFactory *helpers.EtcdClientFactoryImpl + peerNode, targetNode corev1.Node + ) + + g.BeforeEach(func() { + utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) + etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) + utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory) + + nodes, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + randomIndex := rand.Intn(len(nodes.Items)) + peerNode = nodes.Items[randomIndex] + targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)] + }) + + g.It("should reject invalid fencing credential updates and keep PacemakerCluster healthy", func() { + kubeClient := oc.AdminKubeClient() + + ns, secretName, originalPassword, err := apis.RotateNodeBMCPassword(kubeClient, &targetNode) o.Expect(err).ToNot(o.HaveOccurred(), "expected to rotate BMC credentials without error") defer func() { @@ -356,39 +463,81 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual }() g.By("Ensuring etcd members remain healthy after BMC credential rotation") o.Eventually(func() error { - if err := helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivedNode.Name); err != nil { + if err := helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, peerNode.Name); err != nil { return err } - if err := helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, bmcNode.Name); err != nil { + if err := helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, targetNode.Name); err != nil { return err } return nil }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), "etcd members should be healthy after BMC credential rotation") - g.By(fmt.Sprintf("Triggering a fencing-style network disruption between %s and %s", bmcNode.Name, survivedNode.Name)) - command, err := exutil.TriggerNetworkDisruption(oc.KubeClient(), &bmcNode, &survivedNode, networkDisruptionDuration) - o.Expect(err).To(o.BeNil(), "Expected to disrupt network without errors") - framework.Logf("network disruption command: %q", command) - - g.By(fmt.Sprintf("Ensuring cluster recovery with proper leader/learner roles after BMC credential rotation + network disruption (timeout: %v)", memberIsLeaderTimeout)) - leaderNode, learnerNode, learnerStarted := validateEtcdRecoveryStateWithoutAssumingLeader(oc, etcdClientFactory, - &survivedNode, &bmcNode, memberIsLeaderTimeout, utils.FiveSecondPollInterval) + g.By("Triggering fencing job by patching fencing secret so we can assert it refuses to update pacemaker") + fencingSecretName := "fencing-credentials-" + targetNode.Name + secret, err := kubeClient.CoreV1().Secrets(fencingJobNamespace).Get(context.Background(), fencingSecretName, metav1.GetOptions{}) + o.Expect(err).ToNot(o.HaveOccurred(), "get fencing secret for node %s", targetNode.Name) + patched := secret.DeepCopy() + if patched.Data == nil { + patched.Data = make(map[string][]byte) + } + patched.Data["test-trigger"] = []byte("1") + _, err = kubeClient.CoreV1().Secrets(fencingJobNamespace).Update(context.Background(), patched, metav1.UpdateOptions{}) + 
o.Expect(err).ToNot(o.HaveOccurred(), "patch fencing secret to trigger job") - if learnerStarted { - framework.Logf("Learner node %q already started as learner after disruption", learnerNode.Name) - } else { - g.By(fmt.Sprintf("Ensuring '%s' rejoins as learner (timeout: %v)", learnerNode.Name, memberRejoinedLearnerTimeout)) - validateEtcdRecoveryState(oc, etcdClientFactory, - leaderNode, - learnerNode, true, true, - memberRejoinedLearnerTimeout, utils.FiveSecondPollInterval) + g.By("Waiting for fencing job to complete") + o.Eventually(func() error { + job, err := kubeClient.BatchV1().Jobs(fencingJobNamespace).Get(context.Background(), fencingJobName, metav1.GetOptions{}) + if err != nil { + return err + } + if job.Status.CompletionTime == nil { + return fmt.Errorf("job %s not yet complete", fencingJobName) + } + return nil + }, fencingJobWaitTimeout, utils.FiveSecondPollInterval).Should(o.Succeed(), "fencing job must complete") + + g.By("Verifying the secret was never rotated in Pacemaker: fencing job logs show it refused to update stonith") + pods, err := kubeClient.CoreV1().Pods(fencingJobNamespace).List(context.Background(), metav1.ListOptions{LabelSelector: "job-name=" + fencingJobName}) + o.Expect(err).ToNot(o.HaveOccurred(), "list pods for fencing job") + o.Expect(pods.Items).ToNot(o.BeEmpty(), "fencing job should have at least one pod (job may not have been recreated after secret patch)") + pod := &pods.Items[0] + podName := pod.Name + framework.Logf("Fencing job pod: %s (namespace: %s) phase=%s", podName, fencingJobNamespace, pod.Status.Phase) + for i, cs := range pod.Status.ContainerStatuses { + framework.Logf(" container[%d] %s: ready=%v state=%+v", i, cs.Name, cs.Ready, cs.State) + } + // CRI-O may return "unable to retrieve container logs" briefly after the job completes; retry a few times. + const logRetries = 3 + const logRetryDelay = 5 * time.Second + var podLogs string + for attempt := 0; attempt < logRetries; attempt++ { + podLogs, err = oc.AsAdmin().Run("logs").Args(podName, "-n", fencingJobNamespace).Output() + if err != nil { + framework.Logf("Get fencing job pod logs (attempt %d/%d): %v", attempt+1, logRetries, err) + if attempt < logRetries-1 { + time.Sleep(logRetryDelay) + } + continue + } + if strings.Contains(podLogs, "unable to retrieve container logs") && attempt < logRetries-1 { + framework.Logf("Pod logs contained 'unable to retrieve container logs' (attempt %d/%d), retrying in %v", attempt+1, logRetries, logRetryDelay) + time.Sleep(logRetryDelay) + continue + } + break + } + o.Expect(err).ToNot(o.HaveOccurred(), "get fencing job pod logs after %d attempts", logRetries) + framework.Logf("Fencing job pod: %s (namespace: %s). Full pod logs:\n%s", podName, fencingJobNamespace, podLogs) + if strings.Contains(podLogs, "unable to retrieve container logs") { + framework.Failf("could not retrieve container logs for fencing job pod %s (CRI-O may have purged them); pod phase=%s", podName, pod.Status.Phase) } + o.Expect(podLogs).To(o.ContainSubstring("already configured and does not need an update"), + "fencing job must have refused to update stonith (secret never rotated in Pacemaker). Fencing job pod: %s. Full pod logs:\n%s", podName, podLogs) + o.Expect(podLogs).To(o.ContainSubstring(targetNode.Name), + "fencing job logs should mention the node %s. Fencing job pod: %s. 
Full pod logs:\n%s", targetNode.Name, podName, podLogs) - g.By(fmt.Sprintf("Ensuring learner node '%s' is promoted back as voting member (timeout: %v)", learnerNode.Name, memberPromotedVotingTimeout)) - validateEtcdRecoveryState(oc, etcdClientFactory, - leaderNode, - learnerNode, true, false, - memberPromotedVotingTimeout, utils.FiveSecondPollInterval) + g.By("Ensuring PacemakerCluster remained healthy: PacemakerHealthCheckDegraded is not set") + o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed()) }) }) diff --git a/test/extended/two_node/utils/common.go b/test/extended/two_node/utils/common.go index c7cce10fd9bb..260ec20b4b95 100644 --- a/test/extended/two_node/utils/common.go +++ b/test/extended/two_node/utils/common.go @@ -493,6 +493,84 @@ func RemoveConstraint(oc *exutil.CLI, nodeName string, resourceName string) erro return nil } +// PcsNodeStandby puts a node in standby mode (resources moved away, node excluded from cluster). +// +// err := PcsNodeStandby(oc, "master-0", "master-1") +func PcsNodeStandby(oc *exutil.CLI, execNodeName string, targetNodeName string) error { + cmd := fmt.Sprintf("sudo pcs node standby %s", targetNodeName) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, execNodeName, "default", "bash", "-c", cmd) + if err != nil { + return fmt.Errorf("failed to standby node %s: %v, output: %s", targetNodeName, err, output) + } + return nil +} + +// PcsNodeUnstandby brings a node out of standby mode. +// +// err := PcsNodeUnstandby(oc, "master-0", "master-1") +func PcsNodeUnstandby(oc *exutil.CLI, execNodeName string, targetNodeName string) error { + cmd := fmt.Sprintf("sudo pcs node unstandby %s", targetNodeName) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, execNodeName, "default", "bash", "-c", cmd) + if err != nil { + return fmt.Errorf("failed to unstandby node %s: %v, output: %s", targetNodeName, err, output) + } + return nil +} + +// PcsPropertySetMaintenanceMode sets cluster-wide maintenance mode (true or false). +// +// err := PcsPropertySetMaintenanceMode(oc, "master-0", true) +func PcsPropertySetMaintenanceMode(oc *exutil.CLI, execNodeName string, enabled bool) error { + val := "false" + if enabled { + val = "true" + } + cmd := fmt.Sprintf("sudo pcs property set maintenance-mode=%s", val) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, execNodeName, "default", "bash", "-c", cmd) + if err != nil { + return fmt.Errorf("failed to set maintenance-mode=%s: %v, output: %s", val, err, output) + } + return nil +} + +// PcsNodeMaintenance puts a single node in maintenance mode (its resources become unmanaged). +// +// err := PcsNodeMaintenance(oc, "master-0", "master-1") +func PcsNodeMaintenance(oc *exutil.CLI, execNodeName string, targetNodeName string) error { + cmd := fmt.Sprintf("sudo pcs node maintenance %s", targetNodeName) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, execNodeName, "default", "bash", "-c", cmd) + if err != nil { + return fmt.Errorf("failed to put node %s in maintenance: %v, output: %s", targetNodeName, err, output) + } + return nil +} + +// PcsNodeUnmaintenance brings a node out of maintenance mode. 
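+// Pacemaker resumes managing and monitoring the node's resources.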
+// +// err := PcsNodeUnmaintenance(oc, "master-0", "master-1") +func PcsNodeUnmaintenance(oc *exutil.CLI, execNodeName string, targetNodeName string) error { + cmd := fmt.Sprintf("sudo pcs node unmaintenance %s", targetNodeName) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, execNodeName, "default", "bash", "-c", cmd) + if err != nil { + return fmt.Errorf("failed to unmaintenance node %s: %v, output: %s", targetNodeName, err, output) + } + return nil +} + +// PcsStonithUpdatePassword updates the password for a STONITH device (e.g. master-1_redfish). +// Used to simulate fencing agent degraded by setting a wrong password; restore via secret recreation. +// +// err := PcsStonithUpdatePassword(oc, "master-0", "master-1_redfish", "wrongpassword") +func PcsStonithUpdatePassword(oc *exutil.CLI, execNodeName string, stonithID string, password string) error { + // Shell-escape the password for use in pcs stonith update + cmd := fmt.Sprintf("sudo pcs stonith update %s password=%q", stonithID, password) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, execNodeName, "default", "bash", "-c", cmd) + if err != nil { + return fmt.Errorf("failed to update stonith %s password: %v, output: %s", stonithID, err, output) + } + return nil +} + // IsResourceStopped checks if a pacemaker resource is in stopped state. // // stopped, err := IsResourceStopped(oc, "master-0", "kubelet-clone") diff --git a/test/extended/two_node/utils/services/pacemaker.go b/test/extended/two_node/utils/services/pacemaker.go index 8ced5c47ad16..324c3c02e39f 100644 --- a/test/extended/two_node/utils/services/pacemaker.go +++ b/test/extended/two_node/utils/services/pacemaker.go @@ -5,10 +5,14 @@ import ( "context" "encoding/xml" "fmt" + "regexp" + "strings" "time" + operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/origin/test/extended/two_node/utils/core" exutil "github.com/openshift/origin/test/extended/util" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" e2e "k8s.io/kubernetes/test/e2e/framework" ) @@ -35,6 +39,23 @@ const ( pcsProperty = "property" ) +// PacemakerHealthCheckDegradedType is the condition type set by the cluster-etcd-operator on the etcd CR. +const PacemakerHealthCheckDegradedType = "PacemakerHealthCheckDegraded" + +// Event reasons emitted by cluster-etcd-operator when a node is fenced (used to assert fencing occurred). +const ( + EventReasonPacemakerNodeOffline = "PacemakerNodeOffline" + EventReasonPacemakerFencingEvent = "PacemakerFencingEvent" + eventNamespaceEtcdOperator = "openshift-etcd-operator" + eventNamespaceEtcd = "openshift-etcd" +) + +// etcdGetTimeout is the timeout for a single GET of the etcd CR when polling for PacemakerHealthCheckDegraded. +// During fencing or API slowness the server may respond slowly; a longer timeout improves the chance of +// getting a real condition value instead of a timeout error. API errors (including timeout) are retried +// by WaitForPacemakerHealthCheckDegraded and WaitForPacemakerHealthCheckHealthy. +const etcdGetTimeout = 2 * time.Minute + func formatPcsCommandString(command string, envVars string) string { if envVars != "" { return fmt.Sprintf("%s %s %s %s", superuserPrefix, envVars, pcsExecutable, command) @@ -352,3 +373,147 @@ func PcsStatusViaDebug(ctx context.Context, oc *exutil.CLI, nodeName string) (st } return output, err } + +// GetPacemakerHealthCheckCondition returns the PacemakerHealthCheckDegraded condition from the etcd CR. 
+// The condition is set by the cluster-etcd-operator based on pacemaker cluster status. +// Returns nil if the condition is not present. Uses etcdGetTimeout so that during API slowness +// (e.g. after fencing) we get a real response; callers retry on error. +func GetPacemakerHealthCheckCondition(oc *exutil.CLI) (*operatorv1.OperatorCondition, error) { + ctx, cancel := context.WithTimeout(context.Background(), etcdGetTimeout) + defer cancel() + etcd, err := oc.AdminOperatorClient().OperatorV1().Etcds().Get(ctx, "cluster", metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("get etcd cluster: %w", err) + } + for i := range etcd.Status.Conditions { + if etcd.Status.Conditions[i].Type == PacemakerHealthCheckDegradedType { + return &etcd.Status.Conditions[i], nil + } + } + return nil, nil +} + +// WaitForPacemakerHealthCheckDegraded waits for the PacemakerHealthCheckDegraded condition to become True +// and the message to match the given pattern (regexp). Pass an empty string to match any message. +func WaitForPacemakerHealthCheckDegraded(oc *exutil.CLI, messagePattern string, timeout, pollInterval time.Duration) error { + var re *regexp.Regexp + if messagePattern != "" { + var err error + re, err = regexp.Compile(messagePattern) + if err != nil { + return fmt.Errorf("compile message pattern: %w", err) + } + } + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + cond, err := GetPacemakerHealthCheckCondition(oc) + if err != nil { + e2e.Logf("GetPacemakerHealthCheckCondition: %v", err) + time.Sleep(pollInterval) + continue + } + if cond == nil { + time.Sleep(pollInterval) + continue + } + if cond.Status != operatorv1.ConditionTrue { + e2e.Logf("PacemakerHealthCheckDegraded condition status=%s message=%q (waiting for True)", cond.Status, cond.Message) + time.Sleep(pollInterval) + continue + } + if re != nil && !re.MatchString(cond.Message) { + e2e.Logf("PacemakerHealthCheckDegraded condition True but message %q does not match pattern %q", cond.Message, messagePattern) + time.Sleep(pollInterval) + continue + } + return nil + } + return fmt.Errorf("PacemakerHealthCheckDegraded did not become True with message matching %q within %v", messagePattern, timeout) +} + +// WaitForPacemakerHealthCheckHealthy waits for the PacemakerHealthCheckDegraded condition to become False (cleared). +func WaitForPacemakerHealthCheckHealthy(oc *exutil.CLI, timeout, pollInterval time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + cond, err := GetPacemakerHealthCheckCondition(oc) + if err != nil { + time.Sleep(pollInterval) + continue + } + if cond == nil || cond.Status == operatorv1.ConditionFalse { + return nil + } + time.Sleep(pollInterval) + } + return fmt.Errorf("PacemakerHealthCheckDegraded did not become False within %v", timeout) +} + +// WaitForFencingEvent waits until a Kubernetes Event exists that proves a node was fenced. +// The cluster-etcd-operator emits events with Reason PacemakerNodeOffline or PacemakerFencingEvent +// when a node is fenced. This is more reliable than waiting for PacemakerHealthCheckDegraded on the +// etcd CR, since by the time the API server is responsive the node may already be recovering. +// nodeNames: at least one of these node names must appear in the event message (e.g. master-0, master-1). 
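+//
+// err := WaitForFencingEvent(oc, []string{"master-0", "master-1"}, 5*time.Minute, 5*time.Second)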
+func WaitForFencingEvent(oc *exutil.CLI, nodeNames []string, timeout, pollInterval time.Duration) error { + reasons := []string{EventReasonPacemakerNodeOffline, EventReasonPacemakerFencingEvent} + namespaces := []string{eventNamespaceEtcdOperator, eventNamespaceEtcd} + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + ctx, cancel := context.WithTimeout(context.Background(), etcdGetTimeout) + for _, ns := range namespaces { + list, err := oc.AdminKubeClient().CoreV1().Events(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + e2e.Logf("List events in %s: %v", ns, err) + continue + } + for i := range list.Items { + ev := &list.Items[i] + if !contains(reasons, ev.Reason) { + continue + } + msg := ev.Message + for _, name := range nodeNames { + if strings.Contains(msg, name) { + e2e.Logf("Found fencing event: namespace=%s reason=%s message=%q", ns, ev.Reason, msg) + cancel() + return nil + } + } + } + } + cancel() + time.Sleep(pollInterval) + } + return fmt.Errorf("no event with reason %v and message containing any of %v within %v", reasons, nodeNames, timeout) +} + +func contains(ss []string, s string) bool { + for _, x := range ss { + if x == s { + return true + } + } + return false +} + +// AssertPacemakerHealthCheckContains verifies that the PacemakerHealthCheckDegraded condition message +// contains all of the expected substrings. If the condition is not True, returns an error. +// Pass expectedPatterns as substrings that must all appear in the condition message. +func AssertPacemakerHealthCheckContains(oc *exutil.CLI, expectedPatterns []string) error { + cond, err := GetPacemakerHealthCheckCondition(oc) + if err != nil { + return err + } + if cond == nil { + return fmt.Errorf("PacemakerHealthCheckDegraded condition not found on etcd CR") + } + if cond.Status != operatorv1.ConditionTrue { + return fmt.Errorf("PacemakerHealthCheckDegraded condition status is %s, expected True", cond.Status) + } + msg := cond.Message + for _, sub := range expectedPatterns { + if !strings.Contains(msg, sub) { + return fmt.Errorf("PacemakerHealthCheckDegraded message %q does not contain %q", msg, sub) + } + } + return nil +}