From ee0e8e36bcb225978e76f71cead5f05a133c506d Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Wed, 6 Feb 2019 16:51:27 +0100
Subject: [PATCH 1/3] pkg/operator: lay down ClusterOperator for MCO

so that the CVO is aware of our ClusterOperator and can gate on us being
Available.

Signed-off-by: Antonio Murdaca
---
 .../0000_30_machine-config-operator_06_clusteroperator.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 install/0000_30_machine-config-operator_06_clusteroperator.yaml

diff --git a/install/0000_30_machine-config-operator_06_clusteroperator.yaml b/install/0000_30_machine-config-operator_06_clusteroperator.yaml
new file mode 100644
index 0000000000..42a379aae6
--- /dev/null
+++ b/install/0000_30_machine-config-operator_06_clusteroperator.yaml
@@ -0,0 +1,5 @@
+apiVersion: config.openshift.io/v1
+kind: ClusterOperator
+metadata:
+  name: machine-config
+spec: {}
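
A note on the manifest above: the stub ClusterOperator only makes the CVO aware of the MCO; it is still the operator's job to keep the object's status conditions current so the CVO can actually gate on Available. The sketch below shows roughly what that reporting looks like against the config.openshift.io/v1 API and the pre-context client-go signatures of this era; the helper name and messages are made up for illustration, and the OperatorFailing condition was later renamed Degraded upstream.

package main

import (
	"log"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"

	configv1 "github.com/openshift/api/config/v1"
	configclient "github.com/openshift/client-go/config/clientset/versioned"
)

// markAvailable is a hypothetical helper that flips the machine-config
// ClusterOperator to Available=True so the CVO stops gating on it.
func markAvailable(client configclient.Interface, message string) error {
	co, err := client.ConfigV1().ClusterOperators().Get("machine-config", metav1.GetOptions{})
	if err != nil {
		return err
	}
	now := metav1.Now()
	co.Status.Conditions = []configv1.ClusterOperatorStatusCondition{
		{Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue, LastTransitionTime: now, Message: message},
		{Type: configv1.OperatorProgressing, Status: configv1.ConditionFalse, LastTransitionTime: now},
		{Type: configv1.OperatorFailing, Status: configv1.ConditionFalse, LastTransitionTime: now},
	}
	// Status is a subresource, so it has its own update call.
	_, err = client.ConfigV1().ClusterOperators().UpdateStatus(co)
	return err
}

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}
	client, err := configclient.NewForConfig(cfg)
	if err != nil {
		log.Fatal(err)
	}
	if err := markAvailable(client, "operator sync completed"); err != nil {
		log.Fatal(err)
	}
}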
From bd2488f54d083a81e4bfaede7e23e80365dc64d7 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 8 Feb 2019 13:16:03 +0100
Subject: [PATCH 2/3] pkg/operator: always report timeout errors

Basically use the same pattern everywhere as we were getting weird logs
like:

time="2019-02-08T10:35:23Z" level=debug msg="Still waiting for the cluster to initialize: Cluster operator machine-config has not yet reported success"
time="2019-02-08T10:39:38Z" level=debug msg="Still waiting for the cluster to initialize: Cluster operator machine-config is reporting a failure: Failed when progressing towards 3.11.0-587-g0e44a773-dirty because: error syncing: timed out waiting for the condition"

w/o knowing what was actually failing.

Signed-off-by: Antonio Murdaca
---
 pkg/operator/sync.go | 48 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go
index a514087d00..fdbfa87c04 100644
--- a/pkg/operator/sync.go
+++ b/pkg/operator/sync.go
@@ -353,7 +353,6 @@ func (optr *Operator) syncRequiredMachineConfigPools(config renderConfig) error
 			lastErr = fmt.Errorf("error pool %s is not ready. status: (total: %d, updated: %d, unavailable: %d)", p.Name, p.Status.MachineCount, p.Status.UpdatedMachineCount, p.Status.UnavailableMachineCount)
 			return false, nil
 		}
-
 		return true, nil
 	}); err != nil {
 		if err.Error() == wait.ErrWaitTimeout.Error() {
@@ -379,10 +378,11 @@ const (
 )
 
 func (optr *Operator) waitForCustomResourceDefinition(resource *apiextv1beta1.CustomResourceDefinition) error {
-	return wait.Poll(customResourceReadyInterval, customResourceReadyTimeout, func() (bool, error) {
+	var lastErr error
+	if err := wait.Poll(customResourceReadyInterval, customResourceReadyTimeout, func() (bool, error) {
 		crd, err := optr.crdLister.Get(resource.Name)
 		if err != nil {
-			glog.Errorf("error getting CustomResourceDefinition %s: %v", resource.Name, err)
+			lastErr = fmt.Errorf("error getting CustomResourceDefinition %s: %v", resource.Name, err)
 			return false, nil
 		}
 
@@ -391,13 +391,20 @@ func (optr *Operator) waitForCustomResourceDefinition(resource *apiextv1beta1.Cu
 				return true, nil
 			}
 		}
-		glog.V(4).Infof("CustomResourceDefinition %s is not ready. conditions: %v", crd.Name, crd.Status.Conditions)
+		lastErr = fmt.Errorf("CustomResourceDefinition %s is not ready. conditions: %v", crd.Name, crd.Status.Conditions)
 		return false, nil
-	})
+	}); err != nil {
+		if err == wait.ErrWaitTimeout {
+			return fmt.Errorf("%v during syncCustomResourceDefinitions: %v", err, lastErr)
+		}
+		return err
+	}
+	return nil
 }
 
 func (optr *Operator) waitForDeploymentRollout(resource *appsv1.Deployment) error {
-	return wait.Poll(deploymentRolloutPollInterval, deploymentRolloutTimeout, func() (bool, error) {
+	var lastErr error
+	if err := wait.Poll(deploymentRolloutPollInterval, deploymentRolloutTimeout, func() (bool, error) {
 		d, err := optr.deployLister.Deployments(resource.Namespace).Get(resource.Name)
 		if apierrors.IsNotFound(err) {
 			// exit early to recreate the deployment.
@@ -406,7 +413,7 @@ func (optr *Operator) waitForDeploymentRollout(resource *appsv1.Deployment) erro
 		if err != nil {
 			// Do not return error here, as we could be updating the API Server itself, in which case we
 			// want to continue waiting.
-			glog.Errorf("error getting Deployment %s during rollout: %v", resource.Name, err)
+			lastErr = fmt.Errorf("error getting Deployment %s during rollout: %v", resource.Name, err)
 			return false, nil
 		}
 
@@ -417,13 +424,20 @@ func (optr *Operator) waitForDeploymentRollout(resource *appsv1.Deployment) erro
 		if d.Generation <= d.Status.ObservedGeneration && d.Status.UpdatedReplicas == d.Status.Replicas && d.Status.UnavailableReplicas == 0 {
 			return true, nil
 		}
-		glog.V(4).Infof("Deployment %s is not ready. status: (replicas: %d, updated: %d, ready: %d, unavailable: %d)", d.Name, d.Status.Replicas, d.Status.UpdatedReplicas, d.Status.ReadyReplicas, d.Status.UnavailableReplicas)
+		lastErr = fmt.Errorf("Deployment %s is not ready. status: (replicas: %d, updated: %d, ready: %d, unavailable: %d)", d.Name, d.Status.Replicas, d.Status.UpdatedReplicas, d.Status.ReadyReplicas, d.Status.UnavailableReplicas)
 		return false, nil
-	})
+	}); err != nil {
+		if err == wait.ErrWaitTimeout {
+			return fmt.Errorf("%v during waitForDeploymentRollout: %v", err, lastErr)
+		}
+		return err
+	}
+	return nil
 }
 
 func (optr *Operator) waitForDaemonsetRollout(resource *appsv1.DaemonSet) error {
-	return wait.Poll(daemonsetRolloutPollInterval, daemonsetRolloutTimeout, func() (bool, error) {
+	var lastErr error
+	if err := wait.Poll(daemonsetRolloutPollInterval, daemonsetRolloutTimeout, func() (bool, error) {
 		d, err := optr.daemonsetLister.DaemonSets(resource.Namespace).Get(resource.Name)
 		if apierrors.IsNotFound(err) {
 			// exit early to recreate the daemonset.
@@ -432,7 +446,7 @@ func (optr *Operator) waitForDaemonsetRollout(resource *appsv1.DaemonSet) error
 		if err != nil {
 			// Do not return error here, as we could be updating the API Server itself, in which case we
 			// want to continue waiting.
-			glog.Errorf("error getting Daemonset %s during rollout: %v", resource.Name, err)
+			lastErr = fmt.Errorf("error getting Daemonset %s during rollout: %v", resource.Name, err)
 			return false, nil
 		}
 
@@ -443,9 +457,15 @@ func (optr *Operator) waitForDaemonsetRollout(resource *appsv1.DaemonSet) error
 		if d.Generation <= d.Status.ObservedGeneration && d.Status.UpdatedNumberScheduled == d.Status.DesiredNumberScheduled && d.Status.NumberUnavailable == 0 {
 			return true, nil
 		}
-		glog.V(4).Infof("Daemonset %s is not ready. status: (desired: %d, updated: %d, ready: %d, unavailable: %d)", d.Name, d.Status.DesiredNumberScheduled, d.Status.UpdatedNumberScheduled, d.Status.NumberReady, d.Status.NumberAvailable)
+		lastErr = fmt.Errorf("Daemonset %s is not ready. status: (desired: %d, updated: %d, ready: %d, unavailable: %d)", d.Name, d.Status.DesiredNumberScheduled, d.Status.UpdatedNumberScheduled, d.Status.NumberReady, d.Status.NumberAvailable)
 		return false, nil
-	})
+	}); err != nil {
+		if err == wait.ErrWaitTimeout {
+			return fmt.Errorf("%v during waitForDaemonsetRollout: %v", err, lastErr)
+		}
+		return err
+	}
+	return nil
 }
 
 func (optr *Operator) waitForControllerConfigToBeCompleted(resource *mcfgv1.ControllerConfig) error {
@@ -457,7 +477,7 @@ func (optr *Operator) waitForControllerConfigToBeCompleted(resource *mcfgv1.Cont
 		}
 		return true, nil
 	}); err != nil {
-		if err.Error() == wait.ErrWaitTimeout.Error() {
+		if err == wait.ErrWaitTimeout {
 			return fmt.Errorf("%v during waitForControllerConfigToBeCompleted: %v", err, lastErr)
 		}
 		return err
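
The patch above is one small pattern applied to every poll loop in sync.go: record the most recent soft failure in lastErr, keep polling, and fold lastErr into the timeout error on the way out. Distilled into a standalone sketch (the helper name, durations, and sample error are illustrative, not taken from the patch):

package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// waitForThing polls check until it succeeds or the timeout expires,
// remembering the last failure so the timeout error says what was
// actually wrong instead of the bare "timed out waiting for the condition".
func waitForThing(check func() error) error {
	var lastErr error
	if err := wait.Poll(50*time.Millisecond, 250*time.Millisecond, func() (bool, error) {
		if err := check(); err != nil {
			lastErr = err // remembered, not returned: keep polling
			return false, nil
		}
		return true, nil
	}); err != nil {
		if err == wait.ErrWaitTimeout {
			return fmt.Errorf("%v during waitForThing: %v", err, lastErr)
		}
		return err
	}
	return nil
}

func main() {
	err := waitForThing(func() error {
		return errors.New("deployment foo has 0/3 replicas ready")
	})
	fmt.Println(err)
	// timed out waiting for the condition during waitForThing: deployment foo has 0/3 replicas ready
}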
From 34300ccb4ab266b5791909f267d0c8d2cafd645f Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 8 Feb 2019 12:54:40 +0100
Subject: [PATCH 3/3] pkg/operator: wait longer for resources to be available
 to CVO

Signed-off-by: Antonio Murdaca
---
 pkg/operator/sync.go | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go
index fdbfa87c04..b64b7383b6 100644
--- a/pkg/operator/sync.go
+++ b/pkg/operator/sync.go
@@ -29,11 +29,11 @@ func (optr *Operator) syncAll(rconfig renderConfig) error {
 	// syncFuncs is the list of sync functions that are executed in order.
 	// any error marks sync as failure but continues to next syncFunc
 	var syncFuncs = []syncFunc{
-		{ "pools", optr.syncMachineConfigPools },
-		{ "mcc", optr.syncMachineConfigController },
-		{ "mcs", optr.syncMachineConfigServer },
-		{ "mcd", optr.syncMachineConfigDaemon },
-		{ "required-pools", optr.syncRequiredMachineConfigPools },
+		{"pools", optr.syncMachineConfigPools},
+		{"mcc", optr.syncMachineConfigController},
+		{"mcs", optr.syncMachineConfigServer},
+		{"mcd", optr.syncMachineConfigDaemon},
+		{"required-pools", optr.syncRequiredMachineConfigPools},
 	}
 
 	if err := optr.syncProgressingStatus(); err != nil {
@@ -329,7 +329,7 @@ func (optr *Operator) syncRequiredMachineConfigPools(config renderConfig) error
 		return err
 	}
 	var lastErr error
-	if err := wait.Poll(time.Second, 5*time.Minute, func() (bool, error) {
+	if err := wait.Poll(time.Second, 10*time.Minute, func() (bool, error) {
 		pools, err := optr.mcpLister.List(sel)
 		if apierrors.IsNotFound(err) {
 			return false, err
@@ -355,7 +355,7 @@ func (optr *Operator) syncRequiredMachineConfigPools(config renderConfig) error
 		}
 		return true, nil
 	}); err != nil {
-		if err.Error() == wait.ErrWaitTimeout.Error() {
+		if err == wait.ErrWaitTimeout {
 			return fmt.Errorf("%v during syncRequiredMachineConfigPools: %v", err, lastErr)
 		}
 		return err
@@ -365,16 +365,16 @@ func (optr *Operator) syncRequiredMachineConfigPools(config renderConfig) error
 
 const (
 	deploymentRolloutPollInterval = time.Second
-	deploymentRolloutTimeout      = 5 * time.Minute
+	deploymentRolloutTimeout      = 10 * time.Minute
 
 	daemonsetRolloutPollInterval = time.Second
-	daemonsetRolloutTimeout      = 5 * time.Minute
+	daemonsetRolloutTimeout      = 10 * time.Minute
 
 	customResourceReadyInterval = time.Second
-	customResourceReadyTimeout  = 5 * time.Minute
+	customResourceReadyTimeout  = 10 * time.Minute
 
 	controllerConfigCompletedInterval = time.Second
-	controllerConfigCompletedTimeout  = time.Minute
+	controllerConfigCompletedTimeout  = 5 * time.Minute
 )
 
 func (optr *Operator) waitForCustomResourceDefinition(resource *apiextv1beta1.CustomResourceDefinition) error {
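
A second change rides along in the last two patches: comparisons against the poll timeout switch from matching error strings (err.Error() == wait.ErrWaitTimeout.Error()) to comparing identities (err == wait.ErrWaitTimeout). Because wait.ErrWaitTimeout is a package-level sentinel, the identity check cannot be fooled by an unrelated error that merely renders the same text. A hypothetical demonstration:

package main

import (
	"errors"
	"fmt"

	"k8s.io/apimachinery/pkg/util/wait"
)

func isPollTimeout(err error) bool {
	// Identity comparison against the sentinel value itself.
	return err == wait.ErrWaitTimeout
}

func main() {
	fmt.Println(isPollTimeout(wait.ErrWaitTimeout)) // true
	// Same message, different value: the old string comparison would
	// have wrongly classified this one as a poll timeout.
	fmt.Println(isPollTimeout(errors.New("timed out waiting for the condition"))) // false
}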