diff --git a/controllers/clusteroperator.go b/controllers/clusteroperator.go index 84a490bfab8..5f536e0195b 100644 --- a/controllers/clusteroperator.go +++ b/controllers/clusteroperator.go @@ -49,6 +49,9 @@ const ( // ReasonDeploymentCrashLooping indicates that the deployment is crashlooping ReasonDeploymentCrashLooping StatusReason = "DeploymentCrashLooping" + // ReasonNotFound indicates that a required resource (the metal3 deployment or the image cache daemonset) could not be retrieved + ReasonNotFound StatusReason = "ResourceNotFound" + // ReasonUnsupported is an unsupported StatusReason ReasonUnsupported StatusReason = "UnsupportedPlatform" ) @@ -223,7 +226,7 @@ func (r *ProvisioningReconciler) updateCOStatus(newReason StatusReason, msg, pro case ReasonComplete: v1helpers.SetStatusCondition(&conds, setStatusCondition(osconfigv1.OperatorAvailable, osconfigv1.ConditionTrue, string(newReason), msg)) v1helpers.SetStatusCondition(&conds, setStatusCondition(osconfigv1.OperatorProgressing, osconfigv1.ConditionFalse, string(newReason), progressMsg)) - case ReasonInvalidConfiguration, ReasonDeployTimedOut: + case ReasonInvalidConfiguration, ReasonDeployTimedOut, ReasonNotFound: v1helpers.SetStatusCondition(&conds, setStatusCondition(osconfigv1.OperatorDegraded, osconfigv1.ConditionTrue, string(newReason), msg)) v1helpers.SetStatusCondition(&conds, setStatusCondition(osconfigv1.OperatorAvailable, osconfigv1.ConditionTrue, string(ReasonEmpty), "")) v1helpers.SetStatusCondition(&conds, setStatusCondition(osconfigv1.OperatorProgressing, osconfigv1.ConditionTrue, string(newReason), progressMsg)) diff --git a/controllers/clusteroperator_test.go b/controllers/clusteroperator_test.go index caa44e4828f..9c03457b954 100644 --- a/controllers/clusteroperator_test.go +++ b/controllers/clusteroperator_test.go @@ -21,29 +21,62 @@ import ( "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers" ) -func TestUpdateCOStatusDisabled(t *testing.T) { +func TestUpdateCOStatus(t *testing.T) { tCases := []struct { name string + reason
StatusReason + msg string + progressMsg string expectedConditions []osconfigv1.ClusterOperatorStatusCondition }{ { - name: "Correct Condition", + name: "Disabled", + reason: ReasonUnsupported, + msg: "Operator is non-functional", + progressMsg: "", expectedConditions: []osconfigv1.ClusterOperatorStatusCondition{ setStatusCondition(osconfigv1.OperatorDegraded, osconfigv1.ConditionFalse, "", ""), - setStatusCondition(osconfigv1.OperatorAvailable, osconfigv1.ConditionTrue, "AsExpected", "Operational"), - setStatusCondition(OperatorDisabled, osconfigv1.ConditionTrue, "UnsupportedPlatform", "Operator is non-functional"), + setStatusCondition(osconfigv1.OperatorAvailable, osconfigv1.ConditionTrue, string(ReasonExpected), "Operational"), + setStatusCondition(OperatorDisabled, osconfigv1.ConditionTrue, string(ReasonUnsupported), "Operator is non-functional"), setStatusCondition(osconfigv1.OperatorProgressing, osconfigv1.ConditionFalse, "", ""), setStatusCondition(osconfigv1.OperatorUpgradeable, osconfigv1.ConditionTrue, "", ""), }, }, + { + name: "Progressing", + reason: ReasonSyncing, + msg: "", + progressMsg: "syncing metal3 pod", + expectedConditions: []osconfigv1.ClusterOperatorStatusCondition{ + setStatusCondition(osconfigv1.OperatorDegraded, osconfigv1.ConditionFalse, "", ""), + setStatusCondition(osconfigv1.OperatorAvailable, osconfigv1.ConditionTrue, string(ReasonSyncing), ""), + setStatusCondition(OperatorDisabled, osconfigv1.ConditionFalse, "", ""), + setStatusCondition(osconfigv1.OperatorProgressing, osconfigv1.ConditionTrue, string(ReasonSyncing), "syncing metal3 pod"), + setStatusCondition(osconfigv1.OperatorUpgradeable, osconfigv1.ConditionTrue, "", ""), + }, + }, + { + name: "Available", + reason: ReasonComplete, + msg: "metal3 pod running", + progressMsg: "", + expectedConditions: []osconfigv1.ClusterOperatorStatusCondition{ + setStatusCondition(osconfigv1.OperatorDegraded, osconfigv1.ConditionFalse, "", ""), + 
setStatusCondition(osconfigv1.OperatorProgressing, osconfigv1.ConditionFalse, string(ReasonComplete), ""), + setStatusCondition(osconfigv1.OperatorAvailable, osconfigv1.ConditionTrue, string(ReasonComplete), "metal3 pod running"), + setStatusCondition(osconfigv1.OperatorUpgradeable, osconfigv1.ConditionTrue, "", ""), + setStatusCondition(OperatorDisabled, osconfigv1.ConditionFalse, "", ""), + }, + }, } reconciler := newFakeProvisioningReconciler(setUpSchemeForReconciler(), &osconfigv1.Infrastructure{}) - co, _ := reconciler.createClusterOperator() - reconciler.OSClient = fakeconfigclientset.NewSimpleClientset(co) for _, tc := range tCases { - err := reconciler.updateCOStatus(ReasonUnsupported, "Operator is non-functional", "") + co, _ := reconciler.createClusterOperator() + reconciler.OSClient = fakeconfigclientset.NewSimpleClientset(co) + + err := reconciler.updateCOStatus(tc.reason, tc.msg, tc.progressMsg) if err != nil { t.Error(err) } @@ -53,8 +86,8 @@ func TestUpdateCOStatusDisabled(t *testing.T) { if diff != "" { t.Fatal(diff) } + _ = reconciler.OSClient.ConfigV1().ClusterOperators().Delete(context.Background(), clusterOperatorName, metav1.DeleteOptions{}) } - _ = reconciler.OSClient.ConfigV1().ClusterOperators().Delete(context.Background(), clusterOperatorName, metav1.DeleteOptions{}) } func TestEnsureClusterOperator(t *testing.T) { diff --git a/controllers/provisioning_controller.go b/controllers/provisioning_controller.go index c959894cea0..381235463d7 100644 --- a/controllers/provisioning_controller.go +++ b/controllers/provisioning_controller.go @@ -31,15 +31,12 @@ import ( "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" osconfigv1 "github.com/openshift/api/config/v1" osclientset "github.com/openshift/client-go/config/clientset/versioned" metal3iov1alpha1 "github.com/openshift/cluster-baremetal-operator/api/v1alpha1" 
provisioning "github.com/openshift/cluster-baremetal-operator/provisioning" "github.com/openshift/library-go/pkg/operator/events" - "github.com/openshift/library-go/pkg/operator/resource/resourceapply" - "github.com/openshift/library-go/pkg/operator/resource/resourcemerge" ) const ( @@ -189,6 +186,14 @@ func (r *ProvisioningReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error return ctrl.Result{}, err } + specChanged := baremetalConfig.Generation != baremetalConfig.Status.ObservedGeneration + if specChanged { + err = r.updateCOStatus(ReasonSyncing, "", "Applying metal3 resources") + if err != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Syncing state: %v", clusterOperatorName, err) + } + } + if err := provisioning.ValidateBaremetalProvisioningConfig(baremetalConfig); err != nil { // Provisioning configuration is not valid. // Requeue request. @@ -215,7 +220,7 @@ func (r *ProvisioningReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error return ctrl.Result{}, err } - //Create Secrets needed for Metal3 deployment + // Create Secrets needed for Metal3 deployment if err := provisioning.CreateMariadbPasswordSecret(r.KubeClient.CoreV1(), ComponentNamespace, baremetalConfig, r.Scheme); err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to create Mariadb password") } @@ -236,32 +241,42 @@ func (r *ProvisioningReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error } if maoOwned { - r.Log.V(1).Info("metal3 deployment already exists") - err = r.updateCOStatus(ReasonComplete, "found existing Metal3 deployment", "") - if err != nil { - return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Available state: %v", clusterOperatorName, err) - } - return ctrl.Result{}, nil + r.Log.V(1).Info("Adding annotation for CBO to take ownership of metal3 deployment created by MAO") } - specChanged := baremetalConfig.Generation != baremetalConfig.Status.ObservedGeneration - if specChanged { - err = 
r.updateCOStatus(ReasonSyncing, "", "Applying the Metal3 deployment") - if err != nil { - return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Syncing state: %v", clusterOperatorName, err) - } - } + info := r.provisioningInfo(baremetalConfig, &containerImages) // Proceed with creating or updating the Metal3 deployment - updated, err := r.ensureMetal3Deployment(baremetalConfig, &containerImages, metal3DeploymentSelector) + updated, err := provisioning.EnsureMetal3Deployment(info, metal3DeploymentSelector) if err != nil { return ctrl.Result{}, err } if updated { + err = r.Client.Status().Update(context.Background(), baremetalConfig) return ctrl.Result{Requeue: true}, err } - info := r.provisioningInfo(baremetalConfig, &containerImages) + // Determine the status of the deployment; a failure to update the ClusterOperator is kept in coErr so the lookup error in err is still the one wrapped and returned + deploymentState, err := provisioning.GetDeploymentState(r.KubeClient.AppsV1(), ComponentNamespace, baremetalConfig) + if err != nil { + coErr := r.updateCOStatus(ReasonNotFound, "metal3 deployment inaccessible", "") + if coErr != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Degraded state: %v", clusterOperatorName, coErr) + } + return ctrl.Result{}, errors.Wrap(err, "failed to determine state of metal3 deployment") + } + if deploymentState == appsv1.DeploymentReplicaFailure { + err = r.updateCOStatus(ReasonDeployTimedOut, "metal3 deployment rollout taking too long", "") + if err != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Degraded state: %v", clusterOperatorName, err) + } + } else if deploymentState == appsv1.DeploymentAvailable { + err = r.updateCOStatus(ReasonSyncing, "metal3 pod running", "starting other metal3 services") + if err != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Progressing state: %v", clusterOperatorName, err) + } + } + for _, ensureResource := range []ensureFunc{ provisioning.EnsureMetal3StateService, provisioning.EnsureImageCache, @@ -284,36 +299,29 @@ func
(r *ProvisioningReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error } } - err = r.updateCOStatus(ReasonComplete, "new Metal3 deployment completed", "") + // Determine the status of the DaemonSet; a failure to update the ClusterOperator is kept in coErr so the lookup error in err is still the one wrapped and returned + daemonSetState, err := provisioning.GetDaemonSetState(r.KubeClient.AppsV1(), ComponentNamespace, baremetalConfig) if err != nil { - return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Available state: %v", clusterOperatorName, err) - } - - return ctrl.Result{}, nil -} - -func (r *ProvisioningReconciler) ensureMetal3Deployment(provConfig *metal3iov1alpha1.Provisioning, images *provisioning.Images, selector *metav1.LabelSelector) (updated bool, err error) { - metal3Deployment := provisioning.NewMetal3Deployment(ComponentNamespace, images, &provConfig.Spec, selector) - expectedGeneration := resourcemerge.ExpectedDeploymentGeneration(metal3Deployment, provConfig.Status.Generations) + coErr := r.updateCOStatus(ReasonNotFound, "metal3 image cache daemonset inaccessible", "") + if coErr != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Degraded state: %v", clusterOperatorName, coErr) + } - err = controllerutil.SetControllerReference(provConfig, metal3Deployment, r.Scheme) - if err != nil { - err = fmt.Errorf("unable to set controllerReference on deployment: %w", err) - return + return ctrl.Result{}, errors.Wrap(err, "failed to determine state of metal3 image cache daemonset") } - - deployment, updated, err := resourceapply.ApplyDeployment(r.KubeClient.AppsV1(), - events.NewLoggingEventRecorder(ComponentName), metal3Deployment, expectedGeneration) - if err != nil { - err = fmt.Errorf("unable to apply Metal3 deployment: %w", err) - return + if daemonSetState == provisioning.DaemonSetReplicaFailure { + err = r.updateCOStatus(ReasonDeployTimedOut, "metal3 image cache rollout taking too long", "") + if err != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Degraded state: %v", clusterOperatorName, err) +
} + } else if daemonSetState == provisioning.DaemonSetAvailable { + err = r.updateCOStatus(ReasonComplete, "metal3 pod and image cache are running", "") + if err != nil { + return ctrl.Result{}, fmt.Errorf("unable to put %q ClusterOperator in Available state: %v", clusterOperatorName, err) + } } - if updated { - resourcemerge.SetDeploymentGeneration(&provConfig.Status.Generations, deployment) - err = r.Client.Status().Update(context.Background(), provConfig) - } - return + return ctrl.Result{}, nil } func (r *ProvisioningReconciler) provisioningInfo(provConfig *metal3iov1alpha1.Provisioning, images *provisioning.Images) *provisioning.ProvisioningInfo { diff --git a/provisioning/baremetal_pod.go b/provisioning/baremetal_pod.go index ad84dea3be5..d1141ed81ad 100644 --- a/provisioning/baremetal_pod.go +++ b/provisioning/baremetal_pod.go @@ -17,15 +17,20 @@ package provisioning import ( "context" + "fmt" "strconv" + "time" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" appsclientv1 "k8s.io/client-go/kubernetes/typed/apps/v1" "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" metal3iov1alpha1 "github.com/openshift/cluster-baremetal-operator/api/v1alpha1" + "github.com/openshift/library-go/pkg/operator/resource/resourceapply" + "github.com/openshift/library-go/pkg/operator/resource/resourcemerge" ) const ( @@ -42,6 +47,9 @@ const ( cboLabelName = "baremetal.openshift.io/cluster-baremetal-operator" ) +var deploymentRolloutStartTime = time.Now() +var deploymentRolloutTimeout = 5 * time.Minute + var sharedVolumeMount = corev1.VolumeMount{ Name: baremetalSharedVolume, MountPath: "/shared", @@ -567,7 +575,7 @@ func newMetal3PodTemplateSpec(images *Images, config *metal3iov1alpha1.Provision } } -func NewMetal3Deployment(targetNamespace string, images *Images, config *metal3iov1alpha1.ProvisioningSpec, selector *metav1.LabelSelector) *appsv1.Deployment { +func
newMetal3Deployment(targetNamespace string, images *Images, config *metal3iov1alpha1.ProvisioningSpec, selector *metav1.LabelSelector) *appsv1.Deployment { if selector == nil { selector = &metav1.LabelSelector{ MatchLabels: map[string]string{ @@ -583,7 +591,6 @@ func NewMetal3Deployment(targetNamespace string, images *Images, config *metal3i break } } - template := newMetal3PodTemplateSpec(images, config, k8sAppLabel) return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ @@ -616,3 +623,66 @@ func CheckExistingMetal3Deployment(client appsclientv1.DeploymentsGetter, target } return nil, false, err } + +// EnsureMetal3Deployment builds the metal3 Deployment from the current +// provisioning configuration and applies it with resourceapply.ApplyDeployment. +// updated is the flag returned by that apply; when it is set, the rollout start +// time and the recorded deployment generation are refreshed. +func EnsureMetal3Deployment(info *ProvisioningInfo, selector *metav1.LabelSelector) (updated bool, err error) { + // Create metal3 deployment object based on current baremetal configuration + // It will be created with the cboOwnedAnnotation + metal3Deployment := newMetal3Deployment(info.Namespace, info.Images, &info.ProvConfig.Spec, selector) + + expectedGeneration := resourcemerge.ExpectedDeploymentGeneration(metal3Deployment, info.ProvConfig.Status.Generations) + + err = controllerutil.SetControllerReference(info.ProvConfig, metal3Deployment, info.Scheme) + if err != nil { + err = fmt.Errorf("unable to set controllerReference on deployment: %w", err) + return + } + + deployment, updated, err := resourceapply.ApplyDeployment(info.Client.AppsV1(), + info.EventRecorder, metal3Deployment, expectedGeneration) + if err != nil { + err = fmt.Errorf("unable to apply Metal3 deployment: %w", err) + return + } + + if updated { + // Restart the rollout timer only when the deployment changed; resetting it on every call would keep GetDeploymentState from ever reaching deploymentRolloutTimeout. + deploymentRolloutStartTime = time.Now() + resourcemerge.SetDeploymentGeneration(&info.ProvConfig.Status.Generations, deployment) + } + return +} + +// getDeploymentCondition returns the type of the first condition, other than +// Progressing, whose status is True, and Progressing when there is none. +// Progressing is skipped so that the order of Status.Conditions cannot hide an +// Available or ReplicaFailure condition that is also True. +func getDeploymentCondition(deployment *appsv1.Deployment) appsv1.DeploymentConditionType { + for _, cond := range deployment.Status.Conditions { + if cond.Status == corev1.ConditionTrue && cond.Type != appsv1.DeploymentProgressing { + return cond.Type + } + } + return appsv1.DeploymentProgressing +} + +// GetDeploymentState fetches the metal3 deployment and reports its state as a +// deployment condition type. ReplicaFailure is returned when the deployment +// cannot be fetched, or when it is still Progressing after +// deploymentRolloutTimeout has passed since deploymentRolloutStartTime. +// The config parameter is not used. +func GetDeploymentState(client appsclientv1.DeploymentsGetter, targetNamespace string, config *metal3iov1alpha1.Provisioning) (appsv1.DeploymentConditionType, error) { + existing, err := client.Deployments(targetNamespace).Get(context.Background(), baremetalDeploymentName, metav1.GetOptions{}) + if err != nil || existing == nil { + // There were errors accessing the deployment. + return appsv1.DeploymentReplicaFailure, err + } + deploymentState := getDeploymentCondition(existing) + if deploymentState == appsv1.DeploymentProgressing && deploymentRolloutTimeout <= time.Since(deploymentRolloutStartTime) { + return appsv1.DeploymentReplicaFailure, nil + } + return deploymentState, nil +}
diff --git a/provisioning/image_cache.go b/provisioning/image_cache.go index 9cb64b1bf50..a058611f7e8 100644 --- a/provisioning/image_cache.go +++ b/provisioning/image_cache.go @@ -1,17 +1,20 @@ package provisioning import ( + "context" "fmt" "net" "net/url" "path" "regexp" "strconv" + "time" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" + appsclientv1 "k8s.io/client-go/kubernetes/typed/apps/v1" "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -22,12 +25,18 @@ import ( ) const ( - imageCacheSharedVolume = "metal3-shared-image-cache" - imageCacheService = "metal3-image-cache" - imageCachePort = 6181 - imageCachePortName = "http" + imageCacheSharedVolume = "metal3-shared-image-cache" + imageCacheService = "metal3-image-cache" + imageCachePort = 6181 + imageCachePortName = "http" + DaemonSetProgressing appsv1.DaemonSetConditionType = "Progressing" + DaemonSetReplicaFailure appsv1.DaemonSetConditionType = "ReplicaFailure" + DaemonSetAvailable appsv1.DaemonSetConditionType = "Available" ) +var daemonSetRolloutStartTime = time.Now() +var daemonSetRolloutTimeout = 5 * time.Minute + var fileCompressionSuffix = regexp.MustCompile(`\.[gx]z$`) func
imageVolume() corev1.Volume { @@ -217,7 +226,7 @@ func EnsureImageCache(info *ProvisioningInfo) (updated bool, err error) { err = fmt.Errorf("unable to set controllerReference on daemonset: %w", err) return } - + daemonSetRolloutStartTime = time.Now() daemonSet, updated, err := resourceapply.ApplyDaemonSet( info.Client.AppsV1(), info.EventRecorder, @@ -230,3 +239,19 @@ func EnsureImageCache(info *ProvisioningInfo) (updated bool, err error) { resourcemerge.SetDaemonSetGeneration(&info.ProvConfig.Status.Generations, daemonSet) return } + +// GetDaemonSetState reports the state of the metal3 image cache daemonset: Available when NumberReady equals DesiredNumberScheduled, ReplicaFailure when it cannot be fetched or daemonSetRolloutTimeout has passed since daemonSetRolloutStartTime, otherwise Progressing. NOTE(review): EnsureImageCache resets daemonSetRolloutStartTime on every call, so this timeout may never elapse - confirm. +func GetDaemonSetState(client appsclientv1.DaemonSetsGetter, targetNamespace string, config *metal3iov1alpha1.Provisioning) (appsv1.DaemonSetConditionType, error) { + existing, err := client.DaemonSets(targetNamespace).Get(context.Background(), imageCacheService, metav1.GetOptions{}) + if err != nil || existing == nil { + // There were errors accessing the daemonset. + return DaemonSetReplicaFailure, err + } + if existing.Status.NumberReady == existing.Status.DesiredNumberScheduled { + return DaemonSetAvailable, nil + } + if daemonSetRolloutTimeout <= time.Since(daemonSetRolloutStartTime) { + return DaemonSetReplicaFailure, nil + } + return DaemonSetProgressing, nil +}