diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go index 5e0fad828ca0..6f32765aacaf 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go @@ -14,6 +14,7 @@ import ( configv1 "github.com/openshift/api/config/v1" "k8s.io/client-go/rest" + e2e "k8s.io/kubernetes/test/e2e/framework" ) type legacyMonitorTests struct { @@ -90,15 +91,22 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C junits = append(junits, testOperatorOSUpdateStartedEventRecorded(finalIntervals, w.adminRESTConfig)...) isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{}) + topology, err := getControlPlaneTopology(w.adminRESTConfig) + if err != nil { + e2e.Logf("failed to get control plane topology: %v", err) + } + singleNode := topology == configv1.SingleReplicaTopologyMode + if isUpgrade { - junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...) + upgradeFailed := hasUpgradeFailedEvent(finalIntervals) + junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology, upgradeFailed)...) level, err := getUpgradeLevel(w.adminRESTConfig) if err != nil || level == unknownUpgradeLevel { return nil, fmt.Errorf("failed to determine upgrade level: %w", err) } - junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...) + junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, singleNode, upgradeFailed)...) } else { - junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...) 
+ junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig, singleNode)...) } return junits, nil diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index c67ec339a85d..52ae8d223865 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -46,13 +46,7 @@ func checkAuthenticationAvailableExceptions(condition *configv1.ClusterOperatorS return false } -func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase { - topology, err := getControlPlaneTopology(clientConfig) - if err != nil { - logrus.Warnf("Error checking for ControlPlaneTopology configuration (unable to make topology exceptions): %v", err) - } - isSingleNode := topology == configv1.SingleReplicaTopologyMode - +func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, singleNode bool) []*junitapi.JUnitTestCase { except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) string { if condition.Status == configv1.ConditionTrue { if condition.Type == configv1.OperatorAvailable { @@ -64,30 +58,6 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien } } - if isSingleNode { - switch operator { - case "dns": - if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && - strings.Contains(condition.Message, `DNS "default" is unavailable.`) { - return "dns operator is allowed to have Available=False due to serial taint tests on single node" - } - if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue && - strings.Contains(condition.Message, `DNS default is 
degraded`) { - return "dns operator is allowed to have Degraded=True due to serial taint tests on single node" - } - case "openshift-apiserver": - if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && - strings.Contains(condition.Message, `connect: connection refused`) { - return "openshift apiserver operator is allowed to have Available=False due kube-apiserver force rollout test on single node" - } - case "csi-snapshot-controller": - if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && - strings.Contains(condition.Message, `Waiting for Deployment`) { - return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on single node" - } - } - } - // For the non-upgrade case, if any operator has Available=False, fail the test. if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse { if operator == "authentication" { @@ -156,7 +126,7 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien return "We are not worried about other operator condition blips for stable-system tests yet." 
} - return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false) + return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false, singleNode) } func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) { @@ -268,25 +238,15 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool { return false } -func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase { +func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode, upgradeFailed bool) []*junitapi.JUnitTestCase { upgradeWindows := getUpgradeWindows(events) - topology, err := getControlPlaneTopology(clientConfig) - if err != nil { - logrus.Warnf("Error checking for ControlPlaneTopology configuration on upgrade (unable to make topology exceptions): %v", err) - } - isSingleNode := topology == configv1.SingleReplicaTopologyMode isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode - upgradeFailed := hasUpgradeFailedEvent(events) except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string { // When an upgrade was recorded as failed, we will not care about the operator state transitions if upgradeFailed { - return "upgrade failed, not recording unexpected operator transitions as failure" - } - // SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade - if isSingleNode { - return "single node is allowed to be unavailable/degraded during upgrades" + return upgradeFailureException } if condition.Status == configv1.ConditionTrue { @@ -461,9 +421,6 @@ func 
testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf } case "kube-apiserver": if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue { - if isSingleNode && condition.Reason == "NodeInstaller_InstallerPodFailed" { - return "https://issues.redhat.com/browse/OCPBUGS-38678" - } return "https://issues.redhat.com/browse/OCPBUGS-38661" } case "kube-controller-manager": @@ -486,7 +443,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf return "" } - return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true) + return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true, topology == configv1.SingleReplicaTopologyMode) } func isVSphere(config *rest.Config) (bool, error) { @@ -520,7 +477,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config) return 0, fmt.Errorf("Error fetching replicas") } -func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade bool) []*junitapi.JUnitTestCase { +func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade, singleNode bool) []*junitapi.JUnitTestCase { ret := []*junitapi.JUnitTestCase{} var start, stop time.Time @@ -548,7 +505,16 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] }) continue } - + if singleNode { + // SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade + ret = append(ret, &junitapi.JUnitTestCase{ + Name: testName, + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped on 
a single-node cluster", + }, + }) + continue + } excepted := []string{} fatal := []string{} @@ -633,7 +599,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] return ret } -func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase { +const upgradeFailureException = "upgrade failed, not recording unexpected operator transitions as failure" + +func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade, singleNode, upgradeFailed bool) []*junitapi.JUnitTestCase { var ret []*junitapi.JUnitTestCase upgradeWindows := getUpgradeWindows(events) multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1 @@ -677,6 +645,11 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, } except := func(co string, _ string) string { + // When an upgrade was recorded as failed, we will not care about the operator state transitions + if upgradeFailed { + return upgradeFailureException + } + intervals, ok := COWaiting[co] if !ok { // CO have not shown up in CVO Progressing message @@ -729,6 +702,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, mcTestCase.SkipMessage = &junitapi.SkipMessage{ Message: "Test skipped in a patch-level upgrade test", } + } else if singleNode { + mcTestCase.SkipMessage = &junitapi.SkipMessage{ + Message: "Test skipped on a single-node cluster", + } } else if t, ok := coProgressingStart[operatorName]; !ok || t.IsZero() { output := fmt.Sprintf("clusteroperator/%s was never Progressing=True during the upgrade window from %s to %s", operatorName, start.Format(time.RFC3339), stop.Format(time.RFC3339)) exception = except(operatorName, "") @@ -755,6 +732,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, } except = func(co string, reason string) string { + // When an 
upgrade was recorded as failed, we will not care about the operator state transitions + if upgradeFailed { + return upgradeFailureException + } switch co { case "console": if reason == "SyncLoopRefresh_InProgress" { diff --git a/test/extended/machines/scale.go b/test/extended/machines/scale.go index 65811d56e9e1..55aa750c851c 100644 --- a/test/extended/machines/scale.go +++ b/test/extended/machines/scale.go @@ -11,7 +11,7 @@ import ( o "github.com/onsi/gomega" configv1 "github.com/openshift/api/config/v1" configclient "github.com/openshift/client-go/config/clientset/versioned" - bmhelper "github.com/openshift/origin/test/extended/baremetal" + configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" "github.com/stretchr/objx" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -23,6 +23,9 @@ import ( "k8s.io/client-go/scale" e2e "k8s.io/kubernetes/test/e2e/framework" e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" + + bmhelper "github.com/openshift/origin/test/extended/baremetal" + exutil "github.com/openshift/origin/test/extended/util" ) const ( @@ -266,7 +269,18 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl violations = append(violations, operator) } } - o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations) + + cfg, err := e2e.LoadConfig() + o.Expect(err).NotTo(o.HaveOccurred()) + configV1Client, err := configv1client.NewForConfig(cfg) + o.Expect(err).NotTo(o.HaveOccurred()) + topo, err := exutil.GetControlPlaneTopologyFromConfigClient(configV1Client) + if err != nil { + e2e.Logf("failed to get control plane topology: %v", err) + } + if topo == nil || *topo != configv1.SingleReplicaTopologyMode { // a nil topo means the lookup failed; still enforce the check rather than panic + o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations) + } }) // The 30m timeout is essentially required by the baremetal platform 
environment,