@@ -14,6 +14,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
"k8s.io/client-go/rest"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

type legacyMonitorTests struct {
@@ -90,15 +91,22 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
junits = append(junits, testOperatorOSUpdateStartedEventRecorded(finalIntervals, w.adminRESTConfig)...)

isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
topology, err := getControlPlaneTopology(w.adminRESTConfig)
if err != nil {
e2e.Logf("failed to get control plane topology: %v", err)
}
singleNode := topology == configv1.SingleReplicaTopologyMode

if isUpgrade {
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
upgradeFailed := hasUpgradeFailedEvent(finalIntervals)
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology, upgradeFailed)...)
level, err := getUpgradeLevel(w.adminRESTConfig)
if err != nil || level == unknownUpgradeLevel {
return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
}
junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...)
junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, singleNode, upgradeFailed)...)
} else {
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig, singleNode)...)
}

return junits, nil
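The getControlPlaneTopology helper that this hunk now calls once up front appears later in the diff only by its signature. As a rough illustration only (not the PR's actual implementation), a helper with that signature could be built on the openshift/client-go config client as sketched below; the package name, the use of context.TODO, and the function body are assumptions, and the Sketch suffix marks it as hypothetical. Note that when the lookup fails, the caller above only logs the error, so topology keeps its zero value and singleNode evaluates to false.

package monitortests // hypothetical package name for this sketch

import (
	"context"

	configv1 "github.com/openshift/api/config/v1"
	configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"
)

// getControlPlaneTopologySketch reads the cluster Infrastructure object and returns its
// control-plane topology. The real getControlPlaneTopology used by this PR is not shown
// in the diff; this body is an assumption based on the config client API.
func getControlPlaneTopologySketch(clientConfig *rest.Config) (configv1.TopologyMode, error) {
	configClient, err := configv1client.NewForConfig(clientConfig)
	if err != nil {
		return "", err
	}
	infra, err := configClient.Infrastructures().Get(context.TODO(), "cluster", metav1.GetOptions{})
	if err != nil {
		return "", err
	}
	// SingleReplicaTopologyMode is the value the callers above compare against
	// to derive the singleNode flag.
	return infra.Status.ControlPlaneTopology, nil
}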
@@ -46,13 +46,7 @@ func checkAuthenticationAvailableExceptions(condition *configv1.ClusterOperatorS
return false
}

func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
topology, err := getControlPlaneTopology(clientConfig)
if err != nil {
logrus.Warnf("Error checking for ControlPlaneTopology configuration (unable to make topology exceptions): %v", err)
}
isSingleNode := topology == configv1.SingleReplicaTopologyMode

func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, singleNode bool) []*junitapi.JUnitTestCase {
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) string {
if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
@@ -64,30 +58,6 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
}
}

if isSingleNode {
switch operator {
case "dns":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
strings.Contains(condition.Message, `DNS "default" is unavailable.`) {
return "dns operator is allowed to have Available=False due to serial taint tests on single node"
}
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue &&
strings.Contains(condition.Message, `DNS default is degraded`) {
return "dns operator is allowed to have Degraded=True due to serial taint tests on single node"
}
case "openshift-apiserver":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
strings.Contains(condition.Message, `connect: connection refused`) {
return "openshift apiserver operator is allowed to have Available=False due kube-apiserver force rollout test on single node"
}
case "csi-snapshot-controller":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
strings.Contains(condition.Message, `Waiting for Deployment`) {
return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on single node"
}
}
}

// For the non-upgrade case, if any operator has Available=False, fail the test.
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
if operator == "authentication" {
@@ -156,7 +126,7 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
return "We are not worried about other operator condition blips for stable-system tests yet."
}

return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false)
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false, singleNode)
}

func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) {
@@ -268,25 +238,15 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool {
return false
}

func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode, upgradeFailed bool) []*junitapi.JUnitTestCase {
upgradeWindows := getUpgradeWindows(events)
topology, err := getControlPlaneTopology(clientConfig)
if err != nil {
logrus.Warnf("Error checking for ControlPlaneTopology configuration on upgrade (unable to make topology exceptions): %v", err)
}

isSingleNode := topology == configv1.SingleReplicaTopologyMode
isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
upgradeFailed := hasUpgradeFailedEvent(events)

except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string {
// When an upgrade was recorded as failed, we will not care about the operator state transitions
if upgradeFailed {
return "upgrade failed, not recording unexpected operator transitions as failure"
}
// SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
if isSingleNode {
return "single node is allowed to be unavailable/degraded during upgrades"
return upgradeFailureException
}

if condition.Status == configv1.ConditionTrue {
@@ -461,9 +421,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
}
case "kube-apiserver":
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
if isSingleNode && condition.Reason == "NodeInstaller_InstallerPodFailed" {
return "https://issues.redhat.com/browse/OCPBUGS-38678"
}
return "https://issues.redhat.com/browse/OCPBUGS-38661"
}
case "kube-controller-manager":
@@ -486,7 +443,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
return ""
}

return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true)
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true, topology == configv1.SingleReplicaTopologyMode)
}

func isVSphere(config *rest.Config) (bool, error) {
@@ -520,7 +477,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config)
return 0, fmt.Errorf("Error fetching replicas")
}

func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade bool) []*junitapi.JUnitTestCase {
func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade, singleNode bool) []*junitapi.JUnitTestCase {
ret := []*junitapi.JUnitTestCase{}

var start, stop time.Time
@@ -548,7 +505,16 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
})
continue
}

if singleNode {
// SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
SkipMessage: &junitapi.SkipMessage{
Message: "Test skipped on a single-node cluster",
},
})
continue
}
excepted := []string{}
fatal := []string{}

@@ -633,7 +599,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
return ret
}

func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase {
const upgradeFailureException = "upgrade failed, not recording unexpected operator transitions as failure"

func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade, singleNode, upgradeFailed bool) []*junitapi.JUnitTestCase {
var ret []*junitapi.JUnitTestCase
upgradeWindows := getUpgradeWindows(events)
multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1
@@ -677,6 +645,11 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
}

except := func(co string, _ string) string {
// When an upgrade was recorded as failed, we will not care about the operator state transitions
if upgradeFailed {
return upgradeFailureException
}

intervals, ok := COWaiting[co]
if !ok {
// CO have not shown up in CVO Progressing message
@@ -729,6 +702,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
mcTestCase.SkipMessage = &junitapi.SkipMessage{
Message: "Test skipped in a patch-level upgrade test",
}
} else if singleNode {
mcTestCase.SkipMessage = &junitapi.SkipMessage{
Message: "Test skipped on a single-node cluster",
}
} else if t, ok := coProgressingStart[operatorName]; !ok || t.IsZero() {
output := fmt.Sprintf("clusteroperator/%s was never Progressing=True during the upgrade window from %s to %s", operatorName, start.Format(time.RFC3339), stop.Format(time.RFC3339))
exception = except(operatorName, "")
@@ -755,6 +732,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
}

except = func(co string, reason string) string {
// When an upgrade was recorded as failed, we will not care about the operator state transitions
if upgradeFailed {
return upgradeFailureException
}
switch co {
case "console":
if reason == "SyncLoopRefresh_InProgress" {
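The single-node handling added in this file reports the affected junit entries as skipped rather than failed, using junitapi.SkipMessage. Purely as an illustration of that pattern, the skip could be factored into a small helper; this helper is hypothetical, is not part of the PR, and assumes the junitapi package already imported by this file.

// skippedOnSingleNode is a hypothetical helper illustrating the SkipMessage pattern used in
// testOperatorStateTransitions and testUpgradeOperatorProgressingStateTransitions above:
// on single-node topology the test is reported as skipped instead of being evaluated.
func skippedOnSingleNode(testName string) *junitapi.JUnitTestCase {
	return &junitapi.JUnitTestCase{
		Name: testName,
		SkipMessage: &junitapi.SkipMessage{
			Message: "Test skipped on a single-node cluster",
		},
	}
}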
18 changes: 16 additions & 2 deletions test/extended/machines/scale.go
@@ -11,7 +11,7 @@ import (
o "github.com/onsi/gomega"
configv1 "github.com/openshift/api/config/v1"
configclient "github.com/openshift/client-go/config/clientset/versioned"
bmhelper "github.com/openshift/origin/test/extended/baremetal"
configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
"github.com/stretchr/objx"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -23,6 +23,9 @@ import (
"k8s.io/client-go/scale"
e2e "k8s.io/kubernetes/test/e2e/framework"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"

bmhelper "github.com/openshift/origin/test/extended/baremetal"
exutil "github.com/openshift/origin/test/extended/util"
)

const (
@@ -266,7 +269,18 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl
violations = append(violations, operator)
}
}
o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)

cfg, err := e2e.LoadConfig()
o.Expect(err).NotTo(o.HaveOccurred())
configV1Client, err := configv1client.NewForConfig(cfg)
o.Expect(err).NotTo(o.HaveOccurred())
topo, err := exutil.GetControlPlaneTopologyFromConfigClient(configV1Client)
if err != nil {
e2e.Logf("failed to get control plane topology: %v", err)
}
if *topo != configv1.SingleReplicaTopologyMode {
o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
}
})

// The 30m timeout is essentially required by the baremetal platform environment,