Skip to content

Commit

Permalink
Allow operator to skip the osImageURL check ?
Browse files Browse the repository at this point in the history
Right now, if someone overrides OSImageURL on their master pool, the
requires pools sync in the operator will fail because it checks to make
sure the OSImageURL matches, and when it's overridden it doesn't. (This
results in degradation and timeouts).

The check was originally put in to fix a bug around proper update
completion reporting.
  • Loading branch information
jkyros committed Sep 8, 2022
1 parent e884550 commit 01e9a0b
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 19 deletions.
15 changes: 11 additions & 4 deletions pkg/operator/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,9 @@ func (optr *Operator) allMachineConfigPoolStatus() (map[string]string, error) {

// isMachineConfigPoolConfigurationValid returns nil, or error when the configuration of a `pool` is created by the controller at version `version`,
// when the osImageURL does not match what's in the configmap or when the rendered-config-xxx does not match the OCP release version.
func isMachineConfigPoolConfigurationValid(pool *mcfgv1.MachineConfigPool, version, releaseVersion, newFormatOsURL, osURL string, machineConfigGetter func(string) (*mcfgv1.MachineConfig, error)) error {
// TODO(jkyros): this should probably be an Operator method, but we'd have to mock a lot more for the test, Testability seems to be
// why we keep stuffing arguments into this method.
func isMachineConfigPoolConfigurationValid(pool *mcfgv1.MachineConfigPool, version, releaseVersion, newFormatOsURL, osURL string, machineConfigGetter func(string) (*mcfgv1.MachineConfig, error), isUpgrading bool) error {
// both .status.configuration.name and .status.configuration.source must be set.
if pool.Spec.Configuration.Name == "" {
return fmt.Errorf("configuration spec for pool %s is empty: %v", pool.GetName(), machineConfigPoolStatus(pool))
Expand Down Expand Up @@ -610,10 +612,15 @@ func isMachineConfigPoolConfigurationValid(pool *mcfgv1.MachineConfigPool, versi
return err
}

// TODO(jkyros): There is the "I'm upgrading, I need to make sure I'm on the osimageurl that came with the upgrade" case we still want to
// check, but normally now that we allow OSImageURL, this is okay
// We allow OSImageURL overrides now, so this can't be a hard error by itself anymore
if renderedMC.Spec.OSImageURL != osURL && renderedMC.Spec.OSImageURL != newFormatOsURL {
return fmt.Errorf("osImageURL mismatch for %s in %s expected: %s got: %s", pool.GetName(), renderedMC.Name, osURL, renderedMC.Spec.OSImageURL)
// This gets really spammy if the URL is overridden, mostly just for debugging
glog.V(4).Infof("osImageURL has been overridden for %s in %s expected: %s got: %s", pool.GetName(), renderedMC.Name, osURL, renderedMC.Spec.OSImageURL)

// TODO(jkyros): what I'm going for is "if we're upgrading and we've overridden the osimageURL, that's bad, otherwise it's okay"
if isUpgrading {
return fmt.Errorf("osImageURL mismatch while upgrading for %s in %s expected: %s got: %s", pool.GetName(), renderedMC.Name, osURL, renderedMC.Spec.OSImageURL)
}
}

// check that the rendered config matches the OCP release version for cases where there is no OSImageURL change nor new MCO commit
Expand Down
26 changes: 22 additions & 4 deletions pkg/operator/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ func TestIsMachineConfigPoolConfigurationValid(t *testing.T) {
source []string
testurl string
generated string

err error
isUpgrading bool
err error
}{{
err: errors.New("configuration spec for pool dummy-pool is empty: <unknown>"),
}, {
Expand Down Expand Up @@ -117,7 +117,25 @@ func TestIsMachineConfigPoolConfigurationValid(t *testing.T) {
source: []string{"c-0", "u-0"},
testurl: "wrongurl",
generated: "g",
err: errors.New("osImageURL mismatch for dummy-pool in g expected: myurl got: wrongurl"),
err: nil,
}, {
// When we're upgrading, we should care abot OSImageURL, but not when we aren't because we allow overrides
knownConfigs: []config{{
name: "g",
version: "v2",
releaseVersion: "rv2",
}, {
name: "c-0",
version: "v2",
releaseVersion: "rv2",
}, {
name: "u-0",
}},
source: []string{"c-0", "u-0"},
testurl: "wrongurl",
generated: "g",
isUpgrading: true,
err: errors.New("osImageURL mismatch while upgrading for dummy-pool in g expected: myurl got: wrongurl"),
}, {
knownConfigs: []config{{
name: "g",
Expand Down Expand Up @@ -175,7 +193,7 @@ func TestIsMachineConfigPoolConfigurationValid(t *testing.T) {
Status: mcfgv1.MachineConfigPoolStatus{
Configuration: mcfgv1.MachineConfigPoolStatusConfiguration{ObjectReference: corev1.ObjectReference{Name: test.generated}, Source: source},
},
}, "v2", "rv2", "mynewurl", "myurl", getter)
}, "v2", "rv2", "mynewurl", "myurl", getter, test.isUpgrading)
if !reflect.DeepEqual(err, test.err) {
t.Fatalf("expected %v got %v", test.err, err)
}
Expand Down
37 changes: 26 additions & 11 deletions pkg/operator/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ import (
daemonconsts "github.com/openshift/machine-config-operator/pkg/daemon/constants"
"github.com/openshift/machine-config-operator/pkg/server"
"github.com/openshift/machine-config-operator/pkg/version"

cov1helpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
)

const (
Expand Down Expand Up @@ -652,17 +654,20 @@ func (optr *Operator) syncMachineConfigServer(config *renderConfig) error {
func (optr *Operator) syncRequiredMachineConfigPools(_ *renderConfig) error {
var lastErr error
if err := wait.Poll(time.Second, 10*time.Minute, func() (bool, error) {
// TODO(jkyros): we used to only retrieve this if we had an error, but now we need to know if we're updating so
// we grab it every time.
co, err := optr.fetchClusterOperator()
if err != nil {
errs := kubeErrs.NewAggregate([]error{err, lastErr})
lastErr = fmt.Errorf("failed to fetch clusteroperator: %w", errs)
return false, nil
}
if co == nil {
glog.Warning("no clusteroperator for machine-config")
return false, nil
}

if lastErr != nil {
co, err := optr.fetchClusterOperator()
if err != nil {
errs := kubeErrs.NewAggregate([]error{err, lastErr})
lastErr = fmt.Errorf("failed to fetch clusteroperator: %w", errs)
return false, nil
}
if co == nil {
glog.Warning("no clusteroperator for machine-config")
return false, nil
}
optr.setOperatorStatusExtension(&co.Status, lastErr)
_, err = optr.configClient.ConfigV1().ClusterOperators().UpdateStatus(context.TODO(), co, metav1.UpdateOptions{})
if err != nil {
Expand Down Expand Up @@ -697,8 +702,18 @@ func (optr *Operator) syncRequiredMachineConfigPools(_ *renderConfig) error {
glog.Errorf("Error getting configmap osImageURL: %q", err)
return false, nil
}

// TODO(jkyros): The idea here is to figure out if we're updating because I *think* right now
// (until we have our fancy custom build inspection stuff) that we want to degrade if someone upgrades while
// they have osImageURL overridden (because otherwise we'd be lying about the upgrade being 'complete'? )
// Then again...maybe we do just let it go regardless if it's overridden since they "took the wheel"?
var isUpgrading bool
if !optr.vStore.Equal(co.Status.Versions) && cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorProgressing) {
isUpgrading = true
}

releaseVersion, _ := optr.vStore.Get("operator")
if err := isMachineConfigPoolConfigurationValid(pool, version.Hash, releaseVersion, opURL, newFormatOpURL, optr.mcLister.Get); err != nil {
if err := isMachineConfigPoolConfigurationValid(pool, version.Hash, releaseVersion, opURL, newFormatOpURL, optr.mcLister.Get, isUpgrading); err != nil {
lastErr = fmt.Errorf("pool %s has not progressed to latest configuration: %w, retrying", pool.Name, err)
syncerr := optr.syncUpgradeableStatus()
if syncerr != nil {
Expand Down

0 comments on commit 01e9a0b

Please sign in to comment.