diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 000000000..2cdd00882 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,8 @@ +coverage: + status: + project: + default: + target: auto + threshold: 0.50 + base: auto + patch: off \ No newline at end of file diff --git a/README.md b/README.md index ad80479e3..6359aeaef 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,9 @@ spec: apiVersion: apps/v1 kind: Deployment name: podinfo + # the maximum time in seconds for the canary deployment + # to make progress before it is rolled back (default 60s) + progressDeadlineSeconds: 60 # hpa reference (optional) autoscalerRef: apiVersion: autoscaling/v2beta1 diff --git a/artifacts/canaries/canary.yaml b/artifacts/canaries/canary.yaml index 68a15a588..a8fa3d0f9 100644 --- a/artifacts/canaries/canary.yaml +++ b/artifacts/canaries/canary.yaml @@ -9,6 +9,9 @@ spec: apiVersion: apps/v1 kind: Deployment name: podinfo + # the maximum time in seconds for the canary deployment + # to make progress before it is rolled back (default 60s) + progressDeadlineSeconds: 60 # HPA reference (optional) autoscalerRef: apiVersion: autoscaling/v2beta1 diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index 705f337e5..9b7da8d16 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -23,6 +23,8 @@ spec: - service - canaryAnalysis properties: + progressDeadlineSeconds: + type: number targetRef: properties: apiVersion: diff --git a/charts/flagger/templates/crd.yaml b/charts/flagger/templates/crd.yaml index 818cc5f10..58164a920 100644 --- a/charts/flagger/templates/crd.yaml +++ b/charts/flagger/templates/crd.yaml @@ -26,6 +26,8 @@ spec: - service - canaryAnalysis properties: + progressDeadlineSeconds: + type: number targetRef: properties: apiVersion: diff --git a/docs/README.md b/docs/README.md index 11f869a0b..06315b2de 100644 --- a/docs/README.md +++ b/docs/README.md @@ -84,6 +84,9 @@ spec: apiVersion: apps/v1 kind: Deployment name: podinfo + # the maximum time in seconds for the canary deployment + # to make progress before it is rolled back (default 60s) + progressDeadlineSeconds: 60 # hpa reference (optional) autoscalerRef: apiVersion: autoscaling/v2beta1 diff --git a/pkg/apis/flagger/v1alpha1/types.go b/pkg/apis/flagger/v1alpha1/types.go index 504d5ac1e..0e6251cd4 100755 --- a/pkg/apis/flagger/v1alpha1/types.go +++ b/pkg/apis/flagger/v1alpha1/types.go @@ -21,7 +21,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -const CanaryKind = "Canary" +const ( + CanaryKind = "Canary" + ProgressDeadlineSeconds = 60 +) // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -48,6 +51,10 @@ type CanarySpec struct { // metrics and thresholds CanaryAnalysis CanaryAnalysis `json:"canaryAnalysis"` + + // the maximum time in seconds for a canary deployment to make progress + // before it is considered to have failed. Defaults to 60s.
+ ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -60,11 +67,21 @@ type CanaryList struct { Items []Canary `json:"items"` } +// CanaryState is used for the status state +type CanaryState string + +const ( + CanaryRunning CanaryState = "running" + CanaryFinished CanaryState = "finished" + CanaryFailed CanaryState = "failed" + CanaryInitialized CanaryState = "initialized" +) + // CanaryStatus is used for state persistence (read-only) type CanaryStatus struct { - State string `json:"state"` - CanaryRevision string `json:"canaryRevision"` - FailedChecks int `json:"failedChecks"` + State CanaryState `json:"state"` + CanaryRevision string `json:"canaryRevision"` + FailedChecks int `json:"failedChecks"` // +optional LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` } @@ -91,3 +108,11 @@ type CanaryMetric struct { Interval string `json:"interval"` Threshold int `json:"threshold"` } + +func (c *Canary) GetProgressDeadlineSeconds() int { + if c.Spec.ProgressDeadlineSeconds != nil { + return int(*c.Spec.ProgressDeadlineSeconds) + } + + return ProgressDeadlineSeconds +} diff --git a/pkg/apis/flagger/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/flagger/v1alpha1/zz_generated.deepcopy.go index cacaa8151..c0361ad46 100644 --- a/pkg/apis/flagger/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/flagger/v1alpha1/zz_generated.deepcopy.go @@ -155,6 +155,11 @@ func (in *CanarySpec) DeepCopyInto(out *CanarySpec) { out.AutoscalerRef = in.AutoscalerRef in.Service.DeepCopyInto(&out.Service) in.CanaryAnalysis.DeepCopyInto(&out.CanaryAnalysis) + if in.ProgressDeadlineSeconds != nil { + in, out := &in.ProgressDeadlineSeconds, &out.ProgressDeadlineSeconds + *out = new(int32) + **out = **in + } return } diff --git a/pkg/controller/deployer.go b/pkg/controller/deployer.go index f45249913..ca2dfa203 100644 --- a/pkg/controller/deployer.go +++ b/pkg/controller/deployer.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -48,6 +49,7 @@ func (c *CanaryDeployer) Promote(cd *flaggerv1.Canary) error { return fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err) } + primary.Spec.ProgressDeadlineSeconds = canary.Spec.ProgressDeadlineSeconds primary.Spec.MinReadySeconds = canary.Spec.MinReadySeconds primary.Spec.RevisionHistoryLimit = canary.Spec.RevisionHistoryLimit primary.Spec.Strategy = canary.Spec.Strategy @@ -61,37 +63,58 @@ func (c *CanaryDeployer) Promote(cd *flaggerv1.Canary) error { return nil } -// IsReady checks the primary and canary deployment status and returns an error if -// the deployments are in the middle of a rolling update or if the pods are unhealthy -func (c *CanaryDeployer) IsReady(cd *flaggerv1.Canary) error { - canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{}) +// IsPrimaryReady checks the primary deployment status and returns an error if +// the deployment is in the middle of a rolling update or if the pods are unhealthy. +// It returns a non-retriable error if the rolling update is stuck +func (c *CanaryDeployer) IsPrimaryReady(cd *flaggerv1.Canary) (bool, error) { + primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name) + primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{}) if err != nil { if errors.IsNotFound(err) { - return
fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace) + return true, fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace) } - return fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err) + return true, fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err) } - if msg, healthy := c.getDeploymentStatus(canary); !healthy { - return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg) + + retriable, err := c.isDeploymentReady(primary, cd.GetProgressDeadlineSeconds()) + if err != nil { + if retriable { + return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error()) + } else { + return retriable, err + } } - primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name) - primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{}) + if primary.Spec.Replicas == int32p(0) { + return true, fmt.Errorf("halt %s.%s advancement primary deployment is scaled to zero", + cd.Name, cd.Namespace) + } + return true, nil +} + +// IsCanaryReady checks the primary deployment status and returns an error if +// the deployment is in the middle of a rolling update or if the pods are unhealthy +// it will return a non retriable error if the rolling update is stuck +func (c *CanaryDeployer) IsCanaryReady(cd *flaggerv1.Canary) (bool, error) { + canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{}) if err != nil { if errors.IsNotFound(err) { - return fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace) + return true, fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace) } - return fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err) - } - if msg, healthy := c.getDeploymentStatus(primary); !healthy { - return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg) + return true, fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err) } - if primary.Spec.Replicas == int32p(0) { - return fmt.Errorf("halt %s.%s advancement %s", - cd.Name, cd.Namespace, "primary deployment is scaled to zero") + retriable, err := c.isDeploymentReady(canary, cd.GetProgressDeadlineSeconds()) + if err != nil { + if retriable { + return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error()) + } else { + return retriable, fmt.Errorf("deployment does not have minimum availability for more than %vs", + cd.GetProgressDeadlineSeconds()) + } } - return nil + + return true, nil } // IsNewSpec returns true if the canary deployment pod spec has changed @@ -139,7 +162,7 @@ func (c *CanaryDeployer) SetFailedChecks(cd *flaggerv1.Canary, val int) error { } // SetState updates the canary status state -func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state string) error { +func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state flaggerv1.CanaryState) error { cd.Status.State = state cd.Status.LastTransitionTime = metav1.Now() cd, err := c.flaggerClient.FlaggerV1alpha1().Canaries(cd.Namespace).Update(cd) @@ -244,10 +267,11 @@ func (c *CanaryDeployer) createPrimaryDeployment(cd *flaggerv1.Canary) error { }, }, Spec: appsv1.DeploymentSpec{ - MinReadySeconds: canaryDep.Spec.MinReadySeconds, - RevisionHistoryLimit: canaryDep.Spec.RevisionHistoryLimit, - Replicas: canaryDep.Spec.Replicas, - Strategy: canaryDep.Spec.Strategy, + ProgressDeadlineSeconds: 
canaryDep.Spec.ProgressDeadlineSeconds, + MinReadySeconds: canaryDep.Spec.MinReadySeconds, + RevisionHistoryLimit: canaryDep.Spec.RevisionHistoryLimit, + Replicas: canaryDep.Spec.Replicas, + Strategy: canaryDep.Spec.Strategy, Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "app": primaryName, @@ -322,26 +346,41 @@ func (c *CanaryDeployer) createPrimaryHpa(cd *flaggerv1.Canary) error { return nil } -func (c *CanaryDeployer) getDeploymentStatus(deployment *appsv1.Deployment) (string, bool) { +// isDeploymentReady determines if a deployment is ready by checking the status conditions. +// If the deployment has exceeded the progress deadline it returns a non-retriable error +func (c *CanaryDeployer) isDeploymentReady(deployment *appsv1.Deployment, deadline int) (bool, error) { + retriable := true if deployment.Generation <= deployment.Status.ObservedGeneration { - cond := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing) - if cond != nil && cond.Reason == "ProgressDeadlineExceeded" { - return fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.GetName()), false + progress := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing) + if progress != nil { + // Determine if the deployment is stuck by checking if there is a minimum replicas unavailable condition + // and if the last update time exceeds the deadline + available := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentAvailable) + if available != nil && available.Status == "False" && available.Reason == "MinimumReplicasUnavailable" { + from := available.LastUpdateTime + delta := time.Duration(deadline) * time.Second + retriable = !from.Add(delta).Before(time.Now()) + } + } + + if progress != nil && progress.Reason == "ProgressDeadlineExceeded" { + return false, fmt.Errorf("deployment %q exceeded its progress deadline", deployment.GetName()) } else if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas { - return fmt.Sprintf("waiting for rollout to finish: %d out of %d new replicas have been updated", - deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas), false + return retriable, fmt.Errorf("waiting for rollout to finish: %d out of %d new replicas have been updated", + deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas) } else if deployment.Status.Replicas > deployment.Status.UpdatedReplicas { - return fmt.Sprintf("waiting for rollout to finish: %d old replicas are pending termination", - deployment.Status.Replicas-deployment.Status.UpdatedReplicas), false + return retriable, fmt.Errorf("waiting for rollout to finish: %d old replicas are pending termination", + deployment.Status.Replicas-deployment.Status.UpdatedReplicas) } else if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas { - return fmt.Sprintf("waiting for rollout to finish: %d of %d updated replicas are available", - deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas), false + return retriable, fmt.Errorf("waiting for rollout to finish: %d of %d updated replicas are available", + deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas) } + } else { - return "waiting for rollout to finish: observed deployment generation less then desired generation", false + return true, fmt.Errorf("waiting for rollout to finish: observed deployment generation less than desired generation") } - return "ready", true + return true, nil } func (c *CanaryDeployer) getDeploymentCondition( diff
--git a/pkg/controller/deployer_test.go b/pkg/controller/deployer_test.go index 9171057cd..e25e997b9 100644 --- a/pkg/controller/deployer_test.go +++ b/pkg/controller/deployer_test.go @@ -319,7 +319,12 @@ func TestCanaryDeployer_IsReady(t *testing.T) { t.Fatal(err.Error()) } - err = deployer.IsReady(canary) + _, err = deployer.IsPrimaryReady(canary) + if err != nil { + t.Fatal(err.Error()) + } + + _, err = deployer.IsCanaryReady(canary) if err != nil { t.Fatal(err.Error()) } @@ -382,7 +387,7 @@ func TestCanaryDeployer_SetState(t *testing.T) { t.Fatal(err.Error()) } - err = deployer.SetState(canary, "running") + err = deployer.SetState(canary, v1alpha1.CanaryRunning) if err != nil { t.Fatal(err.Error()) } @@ -392,8 +397,8 @@ func TestCanaryDeployer_SetState(t *testing.T) { t.Fatal(err.Error()) } - if res.Status.State != "running" { - t.Errorf("Got %v wanted %v", res.Status.State, "running") + if res.Status.State != v1alpha1.CanaryRunning { + t.Errorf("Got %v wanted %v", res.Status.State, v1alpha1.CanaryRunning) } } @@ -419,7 +424,7 @@ func TestCanaryDeployer_SyncStatus(t *testing.T) { } status := v1alpha1.CanaryStatus{ - State: "running", + State: v1alpha1.CanaryRunning, FailedChecks: 2, } err = deployer.SyncStatus(canary, status) diff --git a/pkg/controller/recorder.go b/pkg/controller/recorder.go index e293989f2..717b40911 100644 --- a/pkg/controller/recorder.go +++ b/pkg/controller/recorder.go @@ -73,9 +73,9 @@ func (cr *CanaryRecorder) SetTotal(namespace string, total int) { func (cr *CanaryRecorder) SetStatus(cd *flaggerv1.Canary) { status := 1 switch cd.Status.State { - case "running": + case flaggerv1.CanaryRunning: status = 0 - case "failed": + case flaggerv1.CanaryFailed: status = 2 default: status = 1 diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index ff5e7b4ad..f2f2a6096 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -56,8 +56,8 @@ func (c *Controller) advanceCanary(name string, namespace string) { maxWeight = cd.Spec.CanaryAnalysis.MaxWeight } - // check primary and canary deployments status - if err := c.deployer.IsReady(cd); err != nil { + // check primary deployment status + if _, err := c.deployer.IsPrimaryReady(cd); err != nil { c.recordEventWarningf(cd, "%v", err) return } @@ -81,10 +81,26 @@ func (c *Controller) advanceCanary(name string, namespace string) { c.recorder.SetDuration(cd, time.Since(begin)) }() + // check canary deployment status + retriable, err := c.deployer.IsCanaryReady(cd) + if err != nil && retriable { + c.recordEventWarningf(cd, "%v", err) + return + } + // check if the number of failed checks reached the threshold - if cd.Status.State == "running" && cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold { - c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v", - cd.Name, cd.Namespace, cd.Status.FailedChecks) + if cd.Status.State == flaggerv1.CanaryRunning && + (!retriable || cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold) { + + if cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold { + c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v", + cd.Name, cd.Namespace, cd.Status.FailedChecks) + } + + if !retriable { + c.recordEventWarningf(cd, "Rolling back %s.%s progress deadline exceeded %v", + cd.Name, cd.Namespace, err) + } // route all traffic back to primary primaryRoute.Weight = 100 @@ -105,10 +121,11 @@ func (c *Controller) advanceCanary(name string, namespace string) { } // mark canary as failed - if err := 
c.deployer.SetState(cd, "failed"); err != nil { + if err := c.deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryFailed}); err != nil { c.logger.Errorf("%v", err) return } + c.recorder.SetStatus(cd) c.sendNotification(cd.Spec.TargetRef.Name, cd.Namespace, "Canary analysis failed, rollback finished.", true) @@ -177,7 +194,7 @@ func (c *Controller) advanceCanary(name string, namespace string) { } // update status - if err := c.deployer.SetState(cd, "finished"); err != nil { + if err := c.deployer.SetState(cd, flaggerv1.CanaryFinished); err != nil { c.recordEventWarningf(cd, "%v", err) return } @@ -194,7 +211,7 @@ func (c *Controller) checkCanaryStatus(cd *flaggerv1.Canary, deployer CanaryDepl } if cd.Status.State == "" { - if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: "initialized"}); err != nil { + if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryInitialized}); err != nil { c.logger.Errorf("%v", err) return false } @@ -213,7 +230,7 @@ func (c *Controller) checkCanaryStatus(cd *flaggerv1.Canary, deployer CanaryDepl c.recordEventErrorf(cd, "%v", err) return false } - if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: "running"}); err != nil { + if err := deployer.SyncStatus(cd, flaggerv1.CanaryStatus{State: flaggerv1.CanaryRunning}); err != nil { c.logger.Errorf("%v", err) return false } diff --git a/pkg/controller/scheduler_test.go b/pkg/controller/scheduler_test.go index 2653af4a5..16c6fbfda 100644 --- a/pkg/controller/scheduler_test.go +++ b/pkg/controller/scheduler_test.go @@ -6,6 +6,7 @@ import ( "time" fakeIstio "github.com/knative/pkg/client/clientset/versioned/fake" + "github.com/stefanprodan/flagger/pkg/apis/flagger/v1alpha1" fakeFlagger "github.com/stefanprodan/flagger/pkg/client/clientset/versioned/fake" informers "github.com/stefanprodan/flagger/pkg/client/informers/externalversions" "github.com/stefanprodan/flagger/pkg/logging" @@ -142,3 +143,71 @@ func TestScheduler_NewRevision(t *testing.T) { t.Errorf("Got canary replicas %v wanted %v", *c.Spec.Replicas, 1) } } + +func TestScheduler_Rollback(t *testing.T) { + canary := newTestCanary() + dep := newTestDeployment() + hpa := newTestHPA() + + flaggerClient := fakeFlagger.NewSimpleClientset(canary) + kubeClient := fake.NewSimpleClientset(dep, hpa) + istioClient := fakeIstio.NewSimpleClientset() + + logger, _ := logging.NewLogger("debug") + deployer := CanaryDeployer{ + flaggerClient: flaggerClient, + kubeClient: kubeClient, + logger: logger, + } + router := CanaryRouter{ + flaggerClient: flaggerClient, + kubeClient: kubeClient, + istioClient: istioClient, + logger: logger, + } + observer := CanaryObserver{ + metricsServer: "fake", + } + + flaggerInformerFactory := informers.NewSharedInformerFactory(flaggerClient, noResyncPeriodFunc()) + flaggerInformer := flaggerInformerFactory.Flagger().V1alpha1().Canaries() + + ctrl := &Controller{ + kubeClient: kubeClient, + istioClient: istioClient, + flaggerClient: flaggerClient, + flaggerLister: flaggerInformer.Lister(), + flaggerSynced: flaggerInformer.Informer().HasSynced, + workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), controllerAgentName), + eventRecorder: &record.FakeRecorder{}, + logger: logger, + canaries: new(sync.Map), + flaggerWindow: time.Second, + deployer: deployer, + router: router, + observer: observer, + recorder: NewCanaryRecorder(false), + } + ctrl.flaggerSynced = alwaysReady + + // init + ctrl.advanceCanary("podinfo", "default") + + // update failed 
checks to max + err := deployer.SyncStatus(canary, v1alpha1.CanaryStatus{State: v1alpha1.CanaryRunning, FailedChecks: 11}) + if err != nil { + t.Fatal(err.Error()) + } + + // detect changes + ctrl.advanceCanary("podinfo", "default") + + c, err := flaggerClient.FlaggerV1alpha1().Canaries("default").Get("podinfo", metav1.GetOptions{}) + if err != nil { + t.Fatal(err.Error()) + } + + if c.Status.State != v1alpha1.CanaryFailed { + t.Errorf("Got canary state %v wanted %v", c.Status.State, v1alpha1.CanaryFailed) + } +}
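
Note: the snippet below is a minimal, standalone sketch (not part of the patch) of the deadline arithmetic that isDeploymentReady introduces above: a rollout is treated as stuck, and therefore non-retriable, once the Available=False / MinimumReplicasUnavailable condition has outlasted progressDeadlineSeconds. The helper name deadlineExceeded and the hard-coded two-minute-old condition are illustrative only, and it assumes the k8s.io/api and k8s.io/apimachinery modules are on the module path.

```go
package main

import (
	"fmt"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// deadlineExceeded reports whether the deployment has been unavailable for longer
// than the progress deadline, mirroring the retriable check in isDeploymentReady
// (retriable = !deadlineExceeded).
func deadlineExceeded(dep *appsv1.Deployment, deadlineSeconds int) bool {
	for _, c := range dep.Status.Conditions {
		if c.Type == appsv1.DeploymentAvailable &&
			c.Status == "False" &&
			c.Reason == "MinimumReplicasUnavailable" {
			delta := time.Duration(deadlineSeconds) * time.Second
			return c.LastUpdateTime.Add(delta).Before(time.Now())
		}
	}
	return false
}

func main() {
	// a deployment whose pods have been unavailable for two minutes
	dep := &appsv1.Deployment{
		Status: appsv1.DeploymentStatus{
			Conditions: []appsv1.DeploymentCondition{{
				Type:           appsv1.DeploymentAvailable,
				Status:         "False",
				Reason:         "MinimumReplicasUnavailable",
				LastUpdateTime: metav1.NewTime(time.Now().Add(-2 * time.Minute)),
			}},
		},
	}

	// with the 60s default this prints true, the case where the readiness check
	// returns retriable=false and the scheduler rolls the canary back
	fmt.Println("deadline exceeded:", deadlineExceeded(dep, 60))
}
```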
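Along the same lines, a small self-contained sketch (assumed helper names, not Flagger API) of the two decisions the patch adds around the deadline: GetProgressDeadlineSeconds falls back to the 60s default when the spec field is nil, and the scheduler rolls back a running canary when either the failed-check threshold is reached or the readiness check reports a non-retriable (stuck) rollout.

```go
package main

import "fmt"

// defaultProgressDeadline mirrors Canary.GetProgressDeadlineSeconds: use the spec
// value when set, otherwise fall back to the 60s default.
func defaultProgressDeadline(spec *int32) int {
	if spec != nil {
		return int(*spec)
	}
	return 60
}

// shouldRollback mirrors the scheduler condition: a running canary is rolled back
// when the analysis failed too many times or when the rollout is stuck
// (the readiness check returned a non-retriable error).
func shouldRollback(running, retriable bool, failedChecks, threshold int) bool {
	return running && (!retriable || failedChecks >= threshold)
}

func main() {
	ten := int32(10)
	fmt.Println(defaultProgressDeadline(&ten)) // 10
	fmt.Println(defaultProgressDeadline(nil))  // 60

	fmt.Println(shouldRollback(true, true, 11, 10)) // true: failed checks threshold reached
	fmt.Println(shouldRollback(true, false, 0, 10)) // true: progress deadline exceeded
	fmt.Println(shouldRollback(true, true, 3, 10))  // false: keep advancing the canary
}
```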