Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rollback canary based on the deployment progress deadline check #10

Merged
merged 11 commits into from
Nov 28, 2018
8 changes: 8 additions & 0 deletions .codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
coverage:
status:
project:
default:
target: auto
threshold: 0.50
base: auto
patch: off
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ spec:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
# to make progress before it is rolled back (default 60s)
progressDeadlineSeconds: 60
# hpa reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta1
Expand Down
3 changes: 3 additions & 0 deletions artifacts/canaries/canary.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ spec:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
# to make progress before it is rolled back (default 60s)
progressDeadlineSeconds: 60
# HPA reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta1
Expand Down
2 changes: 2 additions & 0 deletions artifacts/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ spec:
- service
- canaryAnalysis
properties:
progressDeadlineSeconds:
type: number
targetRef:
properties:
apiVersion:
Expand Down
2 changes: 2 additions & 0 deletions charts/flagger/templates/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ spec:
- service
- canaryAnalysis
properties:
progressDeadlineSeconds:
type: number
targetRef:
properties:
apiVersion:
Expand Down
3 changes: 3 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ spec:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# the maximum time in seconds for the canary deployment
# to make progress before it is rolled back (default 60s)
progressDeadlineSeconds: 60
# hpa reference (optional)
autoscalerRef:
apiVersion: autoscaling/v2beta1
Expand Down
33 changes: 29 additions & 4 deletions pkg/apis/flagger/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const CanaryKind = "Canary"
const (
CanaryKind = "Canary"
ProgressDeadlineSeconds = 60
)

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand All @@ -48,6 +51,10 @@ type CanarySpec struct {

// metrics and thresholds
CanaryAnalysis CanaryAnalysis `json:"canaryAnalysis"`

// the maximum time in seconds for a canary deployment to make progress
// before it is considered to be failed. Defaults to 60s.
ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand All @@ -60,11 +67,21 @@ type CanaryList struct {
Items []Canary `json:"items"`
}

// CanaryState is the string-based type stored in the State field of CanaryStatus.
type CanaryState string

// Values of CanaryState declared by this package; each constant carries the
// literal string that is serialized for the status "state" field.
const (
CanaryRunning CanaryState = "running"
CanaryFinished CanaryState = "finished"
CanaryFailed CanaryState = "failed"
CanaryInitialized CanaryState = "initialized"
)

// CanaryStatus is used for state persistence (read-only)
type CanaryStatus struct {
State string `json:"state"`
CanaryRevision string `json:"canaryRevision"`
FailedChecks int `json:"failedChecks"`
State CanaryState `json:"state"`
CanaryRevision string `json:"canaryRevision"`
FailedChecks int `json:"failedChecks"`
// +optional
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
}
Expand All @@ -91,3 +108,11 @@ type CanaryMetric struct {
Interval string `json:"interval"`
Threshold int `json:"threshold"`
}

// GetProgressDeadlineSeconds returns the progress deadline, in seconds, set on
// the canary spec. When the spec leaves the field unset (nil) it falls back to
// the package-level ProgressDeadlineSeconds default.
func (c *Canary) GetProgressDeadlineSeconds() int {
	deadline := c.Spec.ProgressDeadlineSeconds
	if deadline == nil {
		// No explicit value on the spec: use the package default.
		return ProgressDeadlineSeconds
	}
	return int(*deadline)
}
5 changes: 5 additions & 0 deletions pkg/apis/flagger/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

111 changes: 75 additions & 36 deletions pkg/controller/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
Expand Down Expand Up @@ -48,6 +49,7 @@ func (c *CanaryDeployer) Promote(cd *flaggerv1.Canary) error {
return fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err)
}

primary.Spec.ProgressDeadlineSeconds = canary.Spec.ProgressDeadlineSeconds
primary.Spec.MinReadySeconds = canary.Spec.MinReadySeconds
primary.Spec.RevisionHistoryLimit = canary.Spec.RevisionHistoryLimit
primary.Spec.Strategy = canary.Spec.Strategy
Expand All @@ -61,37 +63,58 @@ func (c *CanaryDeployer) Promote(cd *flaggerv1.Canary) error {
return nil
}

// IsReady checks the primary and canary deployment status and returns an error if
// the deployments are in the middle of a rolling update or if the pods are unhealthy
func (c *CanaryDeployer) IsReady(cd *flaggerv1.Canary) error {
canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{})
// IsPrimaryReady checks the primary deployment status and returns an error if
// the deployment is in the middle of a rolling update or if the pods are unhealthy
// it will return a non retriable error if the rolling update is stuck
func (c *CanaryDeployer) IsPrimaryReady(cd *flaggerv1.Canary) (bool, error) {
primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name)
primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
return fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace)
return true, fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace)
}
return fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err)
return true, fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err)
}
if msg, healthy := c.getDeploymentStatus(canary); !healthy {
return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg)

retriable, err := c.isDeploymentReady(primary, cd.GetProgressDeadlineSeconds())
if err != nil {
if retriable {
return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error())
} else {
return retriable, err
}
}

primaryName := fmt.Sprintf("%s-primary", cd.Spec.TargetRef.Name)
primary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(primaryName, metav1.GetOptions{})
if primary.Spec.Replicas == int32p(0) {
return true, fmt.Errorf("halt %s.%s advancement primary deployment is scaled to zero",
cd.Name, cd.Namespace)
}
return true, nil
}

// IsCanaryReady checks the canary deployment status and returns an error if
// the deployment is in the middle of a rolling update or if the pods are unhealthy
// it will return a non retriable error if the rolling update is stuck
func (c *CanaryDeployer) IsCanaryReady(cd *flaggerv1.Canary) (bool, error) {
canary, err := c.kubeClient.AppsV1().Deployments(cd.Namespace).Get(cd.Spec.TargetRef.Name, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
return fmt.Errorf("deployment %s.%s not found", primaryName, cd.Namespace)
return true, fmt.Errorf("deployment %s.%s not found", cd.Spec.TargetRef.Name, cd.Namespace)
}
return fmt.Errorf("deployment %s.%s query error %v", primaryName, cd.Namespace, err)
}
if msg, healthy := c.getDeploymentStatus(primary); !healthy {
return fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, msg)
return true, fmt.Errorf("deployment %s.%s query error %v", cd.Spec.TargetRef.Name, cd.Namespace, err)
}

if primary.Spec.Replicas == int32p(0) {
return fmt.Errorf("halt %s.%s advancement %s",
cd.Name, cd.Namespace, "primary deployment is scaled to zero")
retriable, err := c.isDeploymentReady(canary, cd.GetProgressDeadlineSeconds())
if err != nil {
if retriable {
return retriable, fmt.Errorf("Halt %s.%s advancement %s", cd.Name, cd.Namespace, err.Error())
} else {
return retriable, fmt.Errorf("deployment does not have minimum availability for more than %vs",
cd.GetProgressDeadlineSeconds())
}
}
return nil

return true, nil
}

// IsNewSpec returns true if the canary deployment pod spec has changed
Expand Down Expand Up @@ -139,7 +162,7 @@ func (c *CanaryDeployer) SetFailedChecks(cd *flaggerv1.Canary, val int) error {
}

// SetState updates the canary status state
func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state string) error {
func (c *CanaryDeployer) SetState(cd *flaggerv1.Canary, state flaggerv1.CanaryState) error {
cd.Status.State = state
cd.Status.LastTransitionTime = metav1.Now()
cd, err := c.flaggerClient.FlaggerV1alpha1().Canaries(cd.Namespace).Update(cd)
Expand Down Expand Up @@ -244,10 +267,11 @@ func (c *CanaryDeployer) createPrimaryDeployment(cd *flaggerv1.Canary) error {
},
},
Spec: appsv1.DeploymentSpec{
MinReadySeconds: canaryDep.Spec.MinReadySeconds,
RevisionHistoryLimit: canaryDep.Spec.RevisionHistoryLimit,
Replicas: canaryDep.Spec.Replicas,
Strategy: canaryDep.Spec.Strategy,
ProgressDeadlineSeconds: canaryDep.Spec.ProgressDeadlineSeconds,
MinReadySeconds: canaryDep.Spec.MinReadySeconds,
RevisionHistoryLimit: canaryDep.Spec.RevisionHistoryLimit,
Replicas: canaryDep.Spec.Replicas,
Strategy: canaryDep.Spec.Strategy,
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"app": primaryName,
Expand Down Expand Up @@ -322,26 +346,41 @@ func (c *CanaryDeployer) createPrimaryHpa(cd *flaggerv1.Canary) error {
return nil
}

func (c *CanaryDeployer) getDeploymentStatus(deployment *appsv1.Deployment) (string, bool) {
// isDeploymentReady determines if a deployment is ready by checking the status conditions
// if a deployment has exceeded the progress deadline it returns a non retriable error
func (c *CanaryDeployer) isDeploymentReady(deployment *appsv1.Deployment, deadline int) (bool, error) {
retriable := true
if deployment.Generation <= deployment.Status.ObservedGeneration {
cond := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing)
if cond != nil && cond.Reason == "ProgressDeadlineExceeded" {
return fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.GetName()), false
progress := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing)
if progress != nil {
// Determine if the deployment is stuck by checking if there is a minimum replicas unavailable condition
// and if the last update time exceeds the deadline
available := c.getDeploymentCondition(deployment.Status, appsv1.DeploymentAvailable)
if available != nil && available.Status == "False" && available.Reason == "MinimumReplicasUnavailable" {
from := available.LastUpdateTime
delta := time.Duration(deadline) * time.Second
retriable = !from.Add(delta).Before(time.Now())
}
}

if progress != nil && progress.Reason == "ProgressDeadlineExceeded" {
return false, fmt.Errorf("deployment %q exceeded its progress deadline", deployment.GetName())
} else if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
return fmt.Sprintf("waiting for rollout to finish: %d out of %d new replicas have been updated",
deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas), false
return retriable, fmt.Errorf("waiting for rollout to finish: %d out of %d new replicas have been updated",
deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas)
} else if deployment.Status.Replicas > deployment.Status.UpdatedReplicas {
return fmt.Sprintf("waiting for rollout to finish: %d old replicas are pending termination",
deployment.Status.Replicas-deployment.Status.UpdatedReplicas), false
return retriable, fmt.Errorf("waiting for rollout to finish: %d old replicas are pending termination",
deployment.Status.Replicas-deployment.Status.UpdatedReplicas)
} else if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas {
return fmt.Sprintf("waiting for rollout to finish: %d of %d updated replicas are available",
deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas), false
return retriable, fmt.Errorf("waiting for rollout to finish: %d of %d updated replicas are available",
deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas)
}

} else {
return "waiting for rollout to finish: observed deployment generation less then desired generation", false
return true, fmt.Errorf("waiting for rollout to finish: observed deployment generation less then desired generation")
}

return "ready", true
return true, nil
}

func (c *CanaryDeployer) getDeploymentCondition(
Expand Down
15 changes: 10 additions & 5 deletions pkg/controller/deployer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,12 @@ func TestCanaryDeployer_IsReady(t *testing.T) {
t.Fatal(err.Error())
}

err = deployer.IsReady(canary)
_, err = deployer.IsPrimaryReady(canary)
if err != nil {
t.Fatal(err.Error())
}

_, err = deployer.IsCanaryReady(canary)
if err != nil {
t.Fatal(err.Error())
}
Expand Down Expand Up @@ -382,7 +387,7 @@ func TestCanaryDeployer_SetState(t *testing.T) {
t.Fatal(err.Error())
}

err = deployer.SetState(canary, "running")
err = deployer.SetState(canary, v1alpha1.CanaryRunning)
if err != nil {
t.Fatal(err.Error())
}
Expand All @@ -392,8 +397,8 @@ func TestCanaryDeployer_SetState(t *testing.T) {
t.Fatal(err.Error())
}

if res.Status.State != "running" {
t.Errorf("Got %v wanted %v", res.Status.State, "running")
if res.Status.State != v1alpha1.CanaryRunning {
t.Errorf("Got %v wanted %v", res.Status.State, v1alpha1.CanaryRunning)
}
}

Expand All @@ -419,7 +424,7 @@ func TestCanaryDeployer_SyncStatus(t *testing.T) {
}

status := v1alpha1.CanaryStatus{
State: "running",
State: v1alpha1.CanaryRunning,
FailedChecks: 2,
}
err = deployer.SyncStatus(canary, status)
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/recorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ func (cr *CanaryRecorder) SetTotal(namespace string, total int) {
func (cr *CanaryRecorder) SetStatus(cd *flaggerv1.Canary) {
status := 1
switch cd.Status.State {
case "running":
case flaggerv1.CanaryRunning:
status = 0
case "failed":
case flaggerv1.CanaryFailed:
status = 2
default:
status = 1
Expand Down
Loading