feat(flagD): support zero downtime during upgrades (#731)

## This PR

- implements graceful shutdown of flagD, which enables zero-downtime upgrades:
  on shutdown, readiness probes are disabled and a shutdown event is sent
  to all connected SDKs (see the sketch after this list)
- creates example manifests for deploying flagD as a standalone
  Deployment
- adds a Makefile target to deploy flagD to a cluster
- adds a ZD test, with a README describing how to run it
- adds a Makefile target to run the ZD test
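
Conceptually, the shutdown flow added here looks roughly like the following Go sketch. The types and names are simplified for illustration and are not the actual flagD API; see the diff below for the real implementation.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// Notification loosely mirrors the notification struct used in this PR;
// the names here are simplified and are not the real flagD types.
type Notification struct {
	Type string
	Data map[string]interface{}
}

// server sketches the two-step shutdown added by this PR:
// stop reporting ready, then fan a shutdown event out to every subscriber.
type server struct {
	ready atomic.Bool
	subs  map[string]chan Notification
}

func (s *server) Shutdown() {
	// 1. /readyz starts failing, so Kubernetes stops routing new traffic to this pod.
	s.ready.Store(false)
	// 2. Every connected SDK receives a shutdown notification and can reconnect elsewhere.
	for _, ch := range s.subs {
		ch <- Notification{Type: "provider_shutdown", Data: map[string]interface{}{}}
	}
}

func main() {
	sub := make(chan Notification, 1)
	s := &server{subs: map[string]chan Notification{"sdk-1": sub}}
	s.ready.Store(true)

	s.Shutdown()
	fmt.Println("ready:", s.ready.Load(), "event:", (<-sub).Type)
}
```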

### Related Issues

Fixes #728 

### Follow-up Tasks
- run the ZD test as part of CI (#732)

---------

Signed-off-by: odubajDT <ondrej.dubaj@dynatrace.com>
odubajDT authored Jul 13, 2023
1 parent 46ac4a3 commit 7df8d39
Showing 10 changed files with 258 additions and 2 deletions.
20 changes: 19 additions & 1 deletion Makefile
@@ -1,8 +1,10 @@
IMG ?= flagd:latest
PHONY: .docker-build .build .run .mockgen
PREFIX=/usr/local
ALL_GO_MOD_DIRS := $(shell find . -type f -name 'go.mod' -exec dirname {} \; | sort)

FLAGD_DEV_NAMESPACE ?= flagd-dev
ZD_TEST_NAMESPACE ?= flagd-zd-test

workspace-init: workspace-clean
go work init
$(foreach module, $(ALL_GO_MOD_DIRS), go work use $(module);)
@@ -67,6 +69,22 @@ mockgen: install-mockgen
generate-docs:
cd flagd; go run ./cmd/doc/main.go

.PHONY: deploy-dev-env
export IMG?= ghcr.io/open-feature/flagd:latest
deploy-dev-env: undeploy-dev-env
kubectl create ns "$(FLAGD_DEV_NAMESPACE)"
envsubst '$${IMG}' < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n "$(FLAGD_DEV_NAMESPACE)"
kubectl apply -f config/deployments/flagd/service.yaml -n "$(FLAGD_DEV_NAMESPACE)"
kubectl wait --for=condition=available deployment/flagd -n "$(FLAGD_DEV_NAMESPACE)" --timeout=300s

undeploy-dev-env:
kubectl delete ns "$(FLAGD_DEV_NAMESPACE)" --ignore-not-found=true

run-zd-test:
kubectl delete ns "$(ZD_TEST_NAMESPACE)" --ignore-not-found=true
kubectl create ns "$(ZD_TEST_NAMESPACE)"
ZD_TEST_NAMESPACE="$(ZD_TEST_NAMESPACE)" FLAGD_DEV_NAMESPACE=$(FLAGD_DEV_NAMESPACE) IMG="$(IMG)" IMG_ZD="$(IMG_ZD)" ./test/zero-downtime/zd_test.sh

# Markdown lint configuration
#
# - .markdownlintignore holds the configuration for files to be ignored
74 changes: 74 additions & 0 deletions config/deployments/flagd/deployment.yaml
@@ -0,0 +1,74 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: flagd
name: flagd
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: flagd
template:
metadata:
labels:
app.kubernetes.io/name: flagd
app: flagd
spec:
containers:
- name: flagd
image: ${IMG}
volumeMounts:
- name: config-volume
mountPath: /etc/flagd
readinessProbe:
httpGet:
path: /readyz
port: 8014
initialDelaySeconds: 5
periodSeconds: 5
livenessProbe:
httpGet:
path: /healthz
port: 8014
initialDelaySeconds: 5
periodSeconds: 60
ports:
- containerPort: 8013
args:
- start
- --uri
- file:/etc/flagd/config.json
- --debug
volumes:
- name: config-volume
configMap:
name: open-feature-flags
items:
- key: flags
path: config.json
---
# ConfigMap for the flagd OpenFeature provider
apiVersion: v1
kind: ConfigMap
metadata:
name: open-feature-flags
data:
flags: |
{
"flags": {
"myStringFlag": {
"state": "ENABLED",
"variants": {
"key1": "val1",
"key2": "val2"
},
"defaultVariant": "key1"
}
}
}
10 changes: 10 additions & 0 deletions config/deployments/flagd/service.yaml
@@ -0,0 +1,10 @@
apiVersion: v1
kind: Service
metadata:
name: flagd-svc
spec:
selector:
app.kubernetes.io/name: flagd
ports:
- port: 8013
targetPort: 8013
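
With the Deployment and Service above applied (for example via `make deploy-dev-env`), a flag can be resolved manually. A quick check might look like the following; the namespace, service name, port, endpoint, and flag key are taken from the manifests and test pod in this PR:

```shell
kubectl -n flagd-dev port-forward svc/flagd-svc 8013:8013 &
curl -X POST localhost:8013/schema.v1.Service/ResolveString \
  -H "Content-Type: application/json" \
  -d '{"flagKey":"myStringFlag","context":{}}'
```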
7 changes: 7 additions & 0 deletions core/pkg/runtime/runtime.go
@@ -84,6 +84,13 @@ func (r *Runtime) Start() error {
return nil
})
}

defer func() {
r.Logger.Info("Shutting down server...")
r.Service.Shutdown()
r.Logger.Info("Server successfully shutdown.")
}()

g.Go(func() error {
// Readiness probe relies on the runtime
r.ServiceConfig.ReadinessProbe = r.isReady
13 changes: 12 additions & 1 deletion core/pkg/service/flag-evaluation/connect_service.go
@@ -37,6 +37,8 @@ type ConnectService struct {

serverMtx sync.RWMutex
metricsServerMtx sync.RWMutex

readinessEnabled bool
}

// NewConnectService creates a ConnectService with provided parameters
@@ -57,6 +59,7 @@ func NewConnectService(
// Serve serves services with provided configuration options
func (s *ConnectService) Serve(ctx context.Context, svcConf service.Configuration) error {
g, gCtx := errgroup.WithContext(ctx)
s.readinessEnabled = true

g.Go(func() error {
return s.startServer(svcConf)
@@ -152,6 +155,14 @@ func (s *ConnectService) AddMiddleware(mw middleware.IMiddleware) {
s.server.Handler = mw.Handler(s.server.Handler)
}

func (s *ConnectService) Shutdown() {
s.readinessEnabled = false
s.eventingConfiguration.emitToAll(service.Notification{
Type: service.Shutdown,
Data: map[string]interface{}{},
})
}

func (s *ConnectService) startServer(svcConf service.Configuration) error {
lis, err := s.setupServer(svcConf)
if err != nil {
@@ -189,7 +200,7 @@ func (s *ConnectService) startMetricsServer(svcConf service.Configuration) error
case "/healthz":
w.WriteHeader(http.StatusOK)
case "/readyz":
if svcConf.ReadinessProbe() {
if s.readinessEnabled && svcConf.ReadinessProbe() {
w.WriteHeader(http.StatusOK)
} else {
w.WriteHeader(http.StatusPreconditionFailed)
40 changes: 40 additions & 0 deletions core/pkg/service/flag-evaluation/connect_service_test.go
@@ -204,3 +204,43 @@ func TestConnectServiceNotify(t *testing.T) {
t.Error("timeout while waiting for notifications")
}
}

func TestConnectServiceShutdown(t *testing.T) {
// given
ctrl := gomock.NewController(t)
eval := mock.NewMockIEvaluator(ctrl)

exp := metric.NewManualReader()
rs := resource.NewWithAttributes("testSchema")
metricRecorder := telemetry.NewOTelRecorder(exp, rs, "my-exporter")

service := NewConnectService(logger.NewLogger(nil, false), eval, metricRecorder)

sChan := make(chan iservice.Notification, 1)
eventing := service.eventingConfiguration
eventing.subs["key"] = sChan

// notification type
ofType := iservice.Shutdown

// emit notification in routine
go func() {
service.Notify(iservice.Notification{
Type: ofType,
Data: map[string]interface{}{},
})
}()

// wait for notification
timeout, cancelFunc := context.WithTimeout(context.Background(), 2*time.Second)
defer cancelFunc()

require.False(t, service.readinessEnabled)

select {
case n := <-sChan:
require.Equal(t, ofType, n.Type, "expected notification type: %s, but received %s", ofType, n.Type)
case <-timeout.Done():
t.Error("timeout while waiting for notifications")
}
}
2 changes: 2 additions & 0 deletions core/pkg/service/iservice.go
@@ -10,6 +10,7 @@ type NotificationType string

const (
ConfigurationChange NotificationType = "configuration_change"
Shutdown NotificationType = "provider_shutdown"
ProviderReady NotificationType = "provider_ready"
KeepAlive NotificationType = "keep_alive"
)
@@ -40,6 +41,7 @@ which call the IEvaluator implementation.
type IFlagEvaluationService interface {
Serve(ctx context.Context, svcConf Configuration) error
Notify(n Notification)
Shutdown()
}

/*
25 changes: 25 additions & 0 deletions test/zero-downtime/README.md
@@ -0,0 +1,25 @@
# flagD zero-downtime test

## How to run

Clone this repository and run the following command to deploy a standalone flagD:

```shell
IMG=your-flagd-image make deploy-dev-env
```

This will create a flagd deployment in the `flagd-dev` namespace.
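
To verify the rollout, you can check the pods and the Deployment status, for example:

```shell
kubectl get pods -n flagd-dev
kubectl rollout status deployment/flagd -n flagd-dev
```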

To run the test, execute:

```shell
IMG=your-flagd-image IMG_ZD=your-flagd-image2 make run-zd-test
```

Please be aware that you first need to build two custom flagD images with different tags.

To build your images using Docker, execute:

```shell
docker build . -t image-name:tag -f flagd/build.Dockerfile
```
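
For example, an end-to-end run with two locally built images (the tag names below are placeholders) could look like this:

```shell
docker build . -t flagd:zd-base -f flagd/build.Dockerfile
docker build . -t flagd:zd-upgrade -f flagd/build.Dockerfile

IMG=flagd:zd-base make deploy-dev-env
IMG=flagd:zd-base IMG_ZD=flagd:zd-upgrade make run-zd-test
```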
25 changes: 25 additions & 0 deletions test/zero-downtime/test-pod.yaml
@@ -0,0 +1,25 @@
apiVersion: v1
kind: Pod
metadata:
name: test-zd
spec:
containers:
- name: test-zd
image: curlimages/curl:8.1.2
# yamllint disable rule:line-length
command:
- 'sh'
- '-c'
- |
for i in $(seq 1 3000); do
curl -H 'Cache-Control: no-cache, no-store' -X POST flagd-svc.$FLAGD_DEV_NAMESPACE.svc.cluster.local:8013/schema.v1.Service/ResolveString?$RANDOM -d '{"flagKey":"myStringFlag","context":{}}' -H "Content-Type: application/json" > ~/out.txt
if ! grep -q "val1" ~/out.txt
then
cat ~/out.txt
echo "\n\nCannot fetch data from flagD, exiting...\n\n"
exit 1
fi
sleep 1
done
exit 0
# yamllint enable rule:line-length
44 changes: 44 additions & 0 deletions test/zero-downtime/zd_test.sh
@@ -0,0 +1,44 @@
#!/bin/sh

set -eu

# Store the flagD image in a helper variable
IMG_ORIGINAL=$IMG

# Create a pod that requests flag values from flagD
envsubst < test/zero-downtime/test-pod.yaml | kubectl apply -f - -n $ZD_TEST_NAMESPACE

for count in 1 2 3;
do
# Update the flagD deployment with the second image
IMG=$IMG_ZD
envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n $FLAGD_DEV_NAMESPACE
kubectl wait --for=condition=available deployment/flagd -n $FLAGD_DEV_NAMESPACE --timeout=30s

# Wait until the client pod executes curl requests against flagD
sleep 20

# Update the flagD deployment back to the original image
IMG=$IMG_ORIGINAL
envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n $FLAGD_DEV_NAMESPACE
kubectl wait --for=condition=available deployment/flagd -n $FLAGD_DEV_NAMESPACE --timeout=30s

# Wait until the client pod executes curl requests against flagD
sleep 20
done

# The pod fails only when it does not get a proper response from curl (which means we do not have zero downtime).
# If it is still running, the last curl request was successful.
kubectl wait --for=condition=ready pod/test-zd -n $ZD_TEST_NAMESPACE --timeout=30s

# If one curl request failed but a later one succeeded, the pod might be in a ready state again.
# Therefore we also check that the restart count is zero, which means every request returned valid data.
restart_count=$(kubectl get pods test-zd -o=jsonpath='{.status.containerStatuses[0].restartCount}' -n $ZD_TEST_NAMESPACE)
if [ "$restart_count" -ne 0 ]; then
echo "Restart count of the test-zd pod is not equal to zero."
exit 1
fi

# Cleanup only when the test passed
kubectl delete ns $ZD_TEST_NAMESPACE --ignore-not-found=true

1 comment on commit 7df8d39

@github-actions

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'Go Benchmark'.
The benchmark result of this commit is worse than the previous benchmark result, exceeding the 1.30 threshold.

| Benchmark suite | Current: 7df8d39 | Previous: 46ac4a3 | Ratio |
|---|---|---|---|
| BenchmarkResolveBooleanValue/test_staticBoolFlag - ns/op | 2687 ns/op | 1579 ns/op | 1.70 |
| BenchmarkResolveBooleanValue/test_targetingBoolFlag - ns/op | 14814 ns/op | 10661 ns/op | 1.39 |
| BenchmarkResolveStringValue/test_staticStringFlag - ns/op | 2152 ns/op | 1640 ns/op | 1.31 |
| BenchmarkResolveStringValue/test_targetingStringFlag - ns/op | 15188 ns/op | 10955 ns/op | 1.39 |
| BenchmarkResolveFloatValue/test:_targetingFloatFlag - ns/op | 15707 ns/op | 11127 ns/op | 1.41 |
| BenchmarkResolveFloatValue/test:_staticObjectFlag - ns/op | 1940 ns/op | 1485 ns/op | 1.31 |
| BenchmarkResolveFloatValue/test:_disabledFlag - ns/op | 2176 ns/op | 1663 ns/op | 1.31 |
| BenchmarkResolveIntValue/test_staticIntFlag - ns/op | 2055 ns/op | 1579 ns/op | 1.30 |
| BenchmarkResolveIntValue/test_targetingNumberFlag - ns/op | 13777 ns/op | 9888 ns/op | 1.39 |
| BenchmarkResolveObjectValue/test_staticObjectFlag - ns/op | 6936 ns/op | 5196 ns/op | 1.33 |
| BenchmarkResolveObjectValue/test_targetingObjectFlag - ns/op | 20114 ns/op | 14467 ns/op | 1.39 |
| BenchmarkFlag_Evaluation_ResolveString/happy_path - ns/op | 12687 ns/op | 9517 ns/op | 1.33 |
| BenchmarkFlag_Evaluation_ResolveFloat/happy_path - ns/op | 12921 ns/op | 9787 ns/op | 1.32 |
| BenchmarkFlag_Evaluation_ResolveInt/happy_path - ns/op | 12838 ns/op | 9662 ns/op | 1.33 |
| BenchmarkFlag_Evaluation_ResolveObject/happy_path - ns/op | 15750 ns/op | 11423 ns/op | 1.38 |

This comment was automatically generated by workflow using github-action-benchmark.
