From 96f3cd1dbac5f4a6e912c3e6e592ad2f04689daf Mon Sep 17 00:00:00 2001
From: Hongkai Liu
Date: Wed, 11 Feb 2026 21:21:39 -0500
Subject: [PATCH 1/2] Skip CO condition tests on SNO

This pull request skips all CO condition tests on SNO.

- `Available=False` and `Degraded=True` are no longer checked at all,
  regardless of whether the test case is executed in an upgrade test
  suite or not. Previously these cases were handled as exceptions, which
  made the job merely flaky instead of failing; the relevant exceptions
  can therefore be removed.

- All checks on the `Progressing` condition are skipped on an SNO
  cluster as well.

The logging when the control plane topology cannot be determined is kept
as-is, because I am not sure on which types of clusters such an error
shows up.
---
 .../legacycvomonitortests/monitortest.go | 13 +++-
 .../legacycvomonitortests/operators.go   | 69 ++++++-------------
 test/extended/machines/scale.go          | 18 ++++-
 3 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
index 5e0fad828ca0..c675972215ed 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
@@ -14,6 +14,7 @@ import (
 
     configv1 "github.com/openshift/api/config/v1"
     "k8s.io/client-go/rest"
+    e2e "k8s.io/kubernetes/test/e2e/framework"
 )
 
 type legacyMonitorTests struct {
@@ -90,15 +91,21 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
     junits = append(junits, testOperatorOSUpdateStartedEventRecorded(finalIntervals, w.adminRESTConfig)...)
 
     isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
+    topology, err := getControlPlaneTopology(w.adminRESTConfig)
+    if err != nil {
+        e2e.Logf("failed to get control plane topology: %v", err)
+    }
+    singleNode := topology == configv1.SingleReplicaTopologyMode
+
     if isUpgrade {
-        junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
+        junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology)...)
         level, err := getUpgradeLevel(w.adminRESTConfig)
         if err != nil || level == unknownUpgradeLevel {
             return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
         }
-        junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...)
+        junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, singleNode)...)
     } else {
-        junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
+        junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig, singleNode)...)
     }
 
     return junits, nil
diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
index c67ec339a85d..2ea4d1aab979 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
@@ -46,13 +46,7 @@ func checkAuthenticationAvailableExceptions(condition *configv1.ClusterOperatorS
     return false
 }
 
-func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
-    topology, err := getControlPlaneTopology(clientConfig)
-    if err != nil {
-        logrus.Warnf("Error checking for ControlPlaneTopology configuration (unable to make topology exceptions): %v", err)
-    }
-    isSingleNode := topology == configv1.SingleReplicaTopologyMode
-
+func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, singleNode bool) []*junitapi.JUnitTestCase {
     except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) string {
         if condition.Status == configv1.ConditionTrue {
             if condition.Type == configv1.OperatorAvailable {
@@ -64,30 +58,6 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
             }
         }
 
-        if isSingleNode {
-            switch operator {
-            case "dns":
-                if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-                    strings.Contains(condition.Message, `DNS "default" is unavailable.`) {
-                    return "dns operator is allowed to have Available=False due to serial taint tests on single node"
-                }
-                if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue &&
-                    strings.Contains(condition.Message, `DNS default is degraded`) {
-                    return "dns operator is allowed to have Degraded=True due to serial taint tests on single node"
-                }
-            case "openshift-apiserver":
-                if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-                    strings.Contains(condition.Message, `connect: connection refused`) {
-                    return "openshift apiserver operator is allowed to have Available=False due kube-apiserver force rollout test on single node"
-                }
-            case "csi-snapshot-controller":
-                if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-                    strings.Contains(condition.Message, `Waiting for Deployment`) {
-                    return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on single node"
-                }
-            }
-        }
-
         // For the non-upgrade case, if any operator has Available=False, fail the test.
         if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
             if operator == "authentication" {
@@ -156,7 +126,7 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
         return "We are not worried about other operator condition blips for stable-system tests yet."
     }
 
-    return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false)
+    return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false, singleNode)
 }
 
 func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) {
@@ -268,14 +238,9 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool {
     return false
 }
 
-func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
+func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
     upgradeWindows := getUpgradeWindows(events)
 
-    topology, err := getControlPlaneTopology(clientConfig)
-    if err != nil {
-        logrus.Warnf("Error checking for ControlPlaneTopology configuration on upgrade (unable to make topology exceptions): %v", err)
-    }
-    isSingleNode := topology == configv1.SingleReplicaTopologyMode
     isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
     upgradeFailed := hasUpgradeFailedEvent(events)
 
@@ -284,10 +249,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
         if upgradeFailed {
             return "upgrade failed, not recording unexpected operator transitions as failure"
         }
-        // SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
-        if isSingleNode {
-            return "single node is allowed to be unavailable/degraded during upgrades"
-        }
 
         if condition.Status == configv1.ConditionTrue {
             if condition.Type == configv1.OperatorAvailable {
@@ -461,9 +422,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
             }
         case "kube-apiserver":
             if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
-                if isSingleNode && condition.Reason == "NodeInstaller_InstallerPodFailed" {
-                    return "https://issues.redhat.com/browse/OCPBUGS-38678"
-                }
                 return "https://issues.redhat.com/browse/OCPBUGS-38661"
             }
         case "kube-controller-manager":
@@ -486,7 +444,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
         return ""
     }
 
-    return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true)
+    return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true, topology == configv1.SingleReplicaTopologyMode)
 }
 
 func isVSphere(config *rest.Config) (bool, error) {
@@ -520,7 +478,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config)
     return 0, fmt.Errorf("Error fetching replicas")
 }
 
-func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade bool) []*junitapi.JUnitTestCase {
+func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade, singleNode bool) []*junitapi.JUnitTestCase {
     ret := []*junitapi.JUnitTestCase{}
 
     var start, stop time.Time
@@ -548,7 +506,16 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
             })
             continue
         }
-
+        if singleNode {
+            // SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
+            ret = append(ret, &junitapi.JUnitTestCase{
+                Name: testName,
+                SkipMessage: &junitapi.SkipMessage{
+                    Message: "Test skipped on a single-node cluster",
+                },
+            })
+            continue
+        }
         excepted := []string{}
         fatal := []string{}
 
@@ -633,7 +600,7 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
     return ret
 }
 
-func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase {
+func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade, singleNode bool) []*junitapi.JUnitTestCase {
     var ret []*junitapi.JUnitTestCase
     upgradeWindows := getUpgradeWindows(events)
     multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1
@@ -729,6 +696,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
             mcTestCase.SkipMessage = &junitapi.SkipMessage{
                 Message: "Test skipped in a patch-level upgrade test",
             }
+        } else if singleNode {
+            mcTestCase.SkipMessage = &junitapi.SkipMessage{
+                Message: "Test skipped on a single-node cluster",
+            }
         } else if t, ok := coProgressingStart[operatorName]; !ok || t.IsZero() {
             output := fmt.Sprintf("clusteroperator/%s was never Progressing=True during the upgrade window from %s to %s", operatorName, start.Format(time.RFC3339), stop.Format(time.RFC3339))
             exception = except(operatorName, "")
diff --git a/test/extended/machines/scale.go b/test/extended/machines/scale.go
index 65811d56e9e1..55aa750c851c 100644
--- a/test/extended/machines/scale.go
+++ b/test/extended/machines/scale.go
@@ -11,7 +11,7 @@ import (
     o "github.com/onsi/gomega"
     configv1 "github.com/openshift/api/config/v1"
     configclient "github.com/openshift/client-go/config/clientset/versioned"
-    bmhelper "github.com/openshift/origin/test/extended/baremetal"
+    configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
     "github.com/stretchr/objx"
     corev1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -23,6 +23,9 @@ import (
     "k8s.io/client-go/scale"
     e2e "k8s.io/kubernetes/test/e2e/framework"
     e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+
+    bmhelper "github.com/openshift/origin/test/extended/baremetal"
+    exutil "github.com/openshift/origin/test/extended/util"
 )
 
 const (
@@ -266,7 +269,18 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl
                 violations = append(violations, operator)
             }
         }
-        o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
+
+        cfg, err := e2e.LoadConfig()
+        o.Expect(err).NotTo(o.HaveOccurred())
+        configV1Client, err := configv1client.NewForConfig(cfg)
+        o.Expect(err).NotTo(o.HaveOccurred())
+        topo, err := exutil.GetControlPlaneTopologyFromConfigClient(configV1Client)
+        if err != nil {
+            e2e.Logf("failed to get control plane topology: %v", err)
+        }
+        if *topo != configv1.SingleReplicaTopologyMode {
+            o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
+        }
     })
 
     // The 30m timeout is essentially required by the baremetal platform environment,

From 223dd4de5d1733cc07ff2bd8d4e1c751d8f86ef3 Mon Sep 17 00:00:00 2001
From: Hongkai Liu
Date: Thu, 12 Feb 2026 15:13:41 -0500
Subject: [PATCH 2/2] CO tests become flaky on a cluster upgrade failure

We do not want to report test results when the upgrade failed. However,
the relevant COs might be the root cause of the upgrade failure. Instead
of skipping the test cases entirely, we make them flaky so that they
still bubble up some signal; the flake signal is weak enough that it
does not lead to a job failure.

This strategy is already implemented for the Available and Degraded
conditions. We extend it to Progressing as well.

There is another place that checks Progressing [1]. It is out of the
scope of `monitortest`, so determining whether an upgrade failed is
challenging there. For now, let us ignore that case.

[1]. https://github.com/openshift/origin/blob/50bae192e1195e41949279128562f5e861f35d72/test/extended/machines/scale.go#L269
---
 .../legacycvomonitortests/monitortest.go |  5 +++--
 .../legacycvomonitortests/operators.go   | 18 ++++++++++++++----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
index c675972215ed..6f32765aacaf 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
@@ -98,12 +98,13 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
     singleNode := topology == configv1.SingleReplicaTopologyMode
 
     if isUpgrade {
-        junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology)...)
+        upgradeFailed := hasUpgradeFailedEvent(finalIntervals)
+        junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology, upgradeFailed)...)
         level, err := getUpgradeLevel(w.adminRESTConfig)
         if err != nil || level == unknownUpgradeLevel {
             return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
         }
-        junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, singleNode)...)
+        junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, singleNode, upgradeFailed)...)
     } else {
         junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig, singleNode)...)
     }
diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
index 2ea4d1aab979..52ae8d223865 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
@@ -238,16 +238,15 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool {
     return false
 }
 
-func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
+func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode, upgradeFailed bool) []*junitapi.JUnitTestCase {
     upgradeWindows := getUpgradeWindows(events)
 
     isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
-    upgradeFailed := hasUpgradeFailedEvent(events)
 
     except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string {
         // When an upgrade was recorded as failed, we will not care about the operator state transitions
         if upgradeFailed {
-            return "upgrade failed, not recording unexpected operator transitions as failure"
+            return upgradeFailureException
         }
 
         if condition.Status == configv1.ConditionTrue {
@@ -600,7 +599,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
     return ret
 }
 
-func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade, singleNode bool) []*junitapi.JUnitTestCase {
+const upgradeFailureException = "upgrade failed, not recording unexpected operator transitions as failure"
+
+func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade, singleNode, upgradeFailed bool) []*junitapi.JUnitTestCase {
     var ret []*junitapi.JUnitTestCase
     upgradeWindows := getUpgradeWindows(events)
     multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1
@@ -644,6 +645,11 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
     }
 
     except := func(co string, _ string) string {
+        // When an upgrade was recorded as failed, we will not care about the operator state transitions
+        if upgradeFailed {
+            return upgradeFailureException
+        }
+
         intervals, ok := COWaiting[co]
         if !ok {
             // CO have not shown up in CVO Progressing message
@@ -726,6 +732,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
     }
 
     except = func(co string, reason string) string {
+        // When an upgrade was recorded as failed, we will not care about the operator state transitions
+        if upgradeFailed {
+            return upgradeFailureException
+        }
         switch co {
         case "console":
             if reason == "SyncLoopRefresh_InProgress" {