Skip to content

Commit

Permalink
Implement canary rollback based on failed checks threshold
Browse files Browse the repository at this point in the history
- use CRD status to store failed checks count
- refactor canary status check and remove annotations
  • Loading branch information
stefanprodan committed Sep 30, 2018
1 parent e4d4ab0 commit 887daea
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 23 deletions.
2 changes: 2 additions & 0 deletions pkg/apis/rollout/v1beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ type VirtualService struct {
}

type CanaryAnalysis struct {
Threshold int `json:"threshold"`
MaxWeight int `json:"maxWeight"`
StepWeight int `json:"stepWeight"`
Metrics []Metric `json:"metrics"`
Expand All @@ -66,6 +67,7 @@ type Metric struct {
// RolloutStatus is the status section of a rollout. Each field is serialized
// to JSON under the key named in its struct tag.
type RolloutStatus struct {
	// State is the rollout state as a plain string.
	State string `json:"state"`

	// CanaryRevision is the canary revision this status was recorded for.
	CanaryRevision string `json:"canaryRevision"`

	// FailedChecks is the stored count of failed checks.
	FailedChecks int `json:"failedChecks"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
83 changes: 60 additions & 23 deletions pkg/controller/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,36 @@ func (c *Controller) advanceDeploymentRollout(name string, namespace string) {
return
}

// gate stage: check if the number of failed checks reached the threshold
if r.Status.State == "running" && r.Status.FailedChecks >= r.Spec.CanaryAnalysis.Threshold {
c.recordEventWarningf(r, "Rolling back %s.%s failed checks threshold reached %v",
r.Spec.Canary.Name, r.Namespace, r.Status.FailedChecks)

// route all traffic back to primary
primaryRoute.Weight = 100
canaryRoute.Weight = 0
if ok := c.updateVirtualServiceRoutes(r, vs, primaryRoute, canaryRoute); !ok {
return
}

c.recordEventWarningf(r, "Canary failed! Scaling down %s.%s",
canary.GetName(), canary.Namespace)

// shutdown canary
c.scaleToZeroCanary(r)

// mark rollout as failed
c.updateRolloutStatus(r, "failed")
return
}

// gate stage: check if the canary success rate is above the threshold
// skip check if no traffic is routed to canary
if canaryRoute.Weight == 0 {
c.recordEventInfof(r, "Starting rollout for %s.%s", r.Name, r.Namespace)
} else {
if ok := c.checkDeploymentMetrics(r); !ok {
c.updateRolloutFailedChecks(r, r.Status.FailedChecks+1)
return
}
}
Expand Down Expand Up @@ -111,7 +135,7 @@ func (c *Controller) advanceDeploymentRollout(name string, namespace string) {

// final stage: mark rollout as finished and scale canary to zero replicas
c.updateRolloutStatus(r, "finished")
c.recordEventInfof(r, "%s.%s promotion complete! Scaling down %s.%s",
c.recordEventInfof(r, "Promotion completed! Scaling down %s.%s",
r.Name, r.Namespace, canary.GetName(), canary.Namespace)
c.scaleToZeroCanary(r)
}
Expand All @@ -129,48 +153,61 @@ func (c *Controller) getRollout(name string, namespace string) (*rolloutv1.Rollo

// checkRolloutStatus inspects the rollout's recorded status against
// canaryVersion and returns a bool to the caller.
//
// From the visible lines: when r.Status.State is empty, or when
// r.Status.CanaryRevision differs from canaryVersion, r.Status is replaced with
// State "running", CanaryRevision canaryVersion and FailedChecks 0, and the
// rollout is persisted through the rollout client's Update call; the result is
// true on success and false if Update returns an error. A State of "running"
// returns true. The trailing return yields false.
//
// NOTE(review): this listing is taken from a diff rendered without +/- markers,
// so the annotation-based statements and the Status-based statements appear
// interleaved and the braces do not balance as shown. Presumably the
// r.Annotations[...] lines and the recordEventErrorf calls are the pre-change
// version — confirm against the repository before relying on this text.
func (c *Controller) checkRolloutStatus(r *rolloutv1.Rollout, canaryVersion string) bool {
var err error
if val, ok := r.Annotations[revisionAnnotation]; !ok {
r.Annotations[revisionAnnotation] = canaryVersion
r.Annotations[statusAnnotation] = "running"
r.Status.State = "running"
r.Status.CanaryRevision = canaryVersion
// An empty State is handled by writing a fresh "running" status.
if r.Status.State == "" {
r.Status = rolloutv1.RolloutStatus{
State: "running",
CanaryRevision: canaryVersion,
FailedChecks: 0,
}
// r is reassigned to Update's result, so the error reporting below reads
// r.Name and r.Namespace from that result.
r, err = c.rolloutClient.AppsV1beta1().Rollouts(r.Namespace).Update(r)
if err != nil {
c.recordEventErrorf(r, "Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
c.logger.Errorf( "Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
return false
}
return true
} else {
if r.Annotations[statusAnnotation] == "running" {
return true
}

if r.Status.State == "running" {
return true
}

// A differing canary revision resets the status to "running" with zero failed checks.
if r.Status.CanaryRevision != canaryVersion {
r.Status = rolloutv1.RolloutStatus{
State: "running",
CanaryRevision: canaryVersion,
FailedChecks: 0,
}
if val != canaryVersion {
r.Annotations[revisionAnnotation] = canaryVersion
r.Annotations[statusAnnotation] = "running"
r.Status.State = "running"
r.Status.CanaryRevision = canaryVersion
r, err = c.rolloutClient.AppsV1beta1().Rollouts(r.Namespace).Update(r)
if err != nil {
c.recordEventErrorf(r, "Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
return false
}
return true
r, err = c.rolloutClient.AppsV1beta1().Rollouts(r.Namespace).Update(r)
if err != nil {
c.logger.Errorf( "Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
return false
}
return true
}

return false
}

// updateRolloutStatus stores status on the rollout and persists the rollout
// through the rollout client's Update call. It returns true when Update
// succeeds and false when Update returns an error, after reporting the error.
//
// NOTE(review): this listing is taken from a diff rendered without +/- markers.
// Presumably the r.Annotations[statusAnnotation] write and the
// recordEventErrorf call are the pre-change lines shown next to their
// replacements (r.Status.State and c.logger.Errorf) — confirm against the
// repository.
func (c *Controller) updateRolloutStatus(r *rolloutv1.Rollout, status string) bool {
var err error
r.Annotations[statusAnnotation] = status
r.Status.State = status
// r is reassigned to Update's result, so the error reporting below reads
// r.Name and r.Namespace from that result rather than from the argument.
r, err = c.rolloutClient.AppsV1beta1().Rollouts(r.Namespace).Update(r)
if err != nil {
c.recordEventErrorf(r, "Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
c.logger.Errorf( "Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
return false
}
return true
}

// updateRolloutFailedChecks sets r.Status.FailedChecks to val and persists the
// rollout through the rollout client's Update call.
//
// It returns true when Update succeeds. When Update returns an error, the
// error is logged together with the rollout's name and namespace and false is
// returned.
func (c *Controller) updateRolloutFailedChecks(r *rolloutv1.Rollout, val int) bool {
	r.Status.FailedChecks = val

	// The result of Update is deliberately not assigned back to r. Doing so
	// before the error check meant the failure message was formatted from
	// whatever Update returned alongside the error, not from the rollout that
	// was being updated. r is a local copy of the caller's pointer, so dropping
	// the reassignment does not change what the caller observes.
	if _, err := c.rolloutClient.AppsV1beta1().Rollouts(r.Namespace).Update(r); err != nil {
		c.logger.Errorf("Rollout %s.%s status update failed: %v", r.Name, r.Namespace, err)
		return false
	}
	return true
}

func (c *Controller) getDeployment(r *rolloutv1.Rollout, name string, namespace string) (*appsv1.Deployment, bool) {
Expand Down

0 comments on commit 887daea

Please sign in to comment.