Skip to content

Commit

Permalink
fix: Graceful shutdown for the API server (argoproj#18642)
Browse files Browse the repository at this point in the history
Closes argoproj#18642

Implements a graceful shutdown for the API server. Without this, the ArgoCD API server will eventually return 502 during a rolling update. However, the healthcheck would return 503 if the server is terminating.

Signed-off-by: Andrii Korotkov <andrii.korotkov@verkada.com>
Co-authored-by: Leonardo Luz Almeida <leonardo_almeida@intuit.com>
Co-authored-by: Michael Crenshaw <350466+crenshaw-dev@users.noreply.github.com>
  • Loading branch information
3 people committed Nov 28, 2024
1 parent 8bce61e commit de51fbb
Show file tree
Hide file tree
Showing 4 changed files with 258 additions and 39 deletions.
17 changes: 10 additions & 7 deletions cmd/argocd-server/commands/argocd_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,23 +256,26 @@ func NewCommand() *cobra.Command {
stats.StartStatsTicker(10 * time.Minute)
stats.RegisterHeapDumper("memprofile")
argocd := server.NewServer(ctx, argoCDOpts, appsetOpts)
argocd.Init(ctx)
lns, err := argocd.Listen()
errors.CheckError(err)
for {
var closer func()
ctx, cancel := context.WithCancel(ctx)
serverCtx, cancel := context.WithCancel(ctx)
argocd.Init(serverCtx)
lns, err := argocd.Listen()
errors.CheckError(err)
if otlpAddress != "" {
closer, err = traceutil.InitTracer(ctx, "argocd-server", otlpAddress, otlpInsecure, otlpHeaders, otlpAttrs)
closer, err = traceutil.InitTracer(serverCtx, "argocd-server", otlpAddress, otlpInsecure, otlpHeaders, otlpAttrs)
if err != nil {
log.Fatalf("failed to initialize tracing: %v", err)
}
}
argocd.Run(ctx, lns)
cancel()
argocd.Run(serverCtx, lns)
if closer != nil {
closer()
}
cancel()
if argocd.TerminateRequested() {
break
}
}
},
Example: templates.Examples(`
Expand Down
165 changes: 133 additions & 32 deletions server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ import (
"net/url"
"os"
"os/exec"
"os/signal"
"path"
"path/filepath"
"reflect"
"regexp"
go_runtime "runtime"
"strings"
gosync "sync"
"sync/atomic"
"syscall"
"time"

// nolint:staticcheck
Expand Down Expand Up @@ -187,17 +190,20 @@ type ArgoCDServer struct {
db db.ArgoDB

// stopCh is the channel which when closed, will shutdown the Argo CD server
stopCh chan struct{}
userStateStorage util_session.UserStateStorage
indexDataInit gosync.Once
indexData []byte
indexDataErr error
staticAssets http.FileSystem
apiFactory api.Factory
secretInformer cache.SharedIndexInformer
configMapInformer cache.SharedIndexInformer
serviceSet *ArgoCDServiceSet
extensionManager *extension.Manager
stopCh chan os.Signal
userStateStorage util_session.UserStateStorage
indexDataInit gosync.Once
indexData []byte
indexDataErr error
staticAssets http.FileSystem
apiFactory api.Factory
secretInformer cache.SharedIndexInformer
configMapInformer cache.SharedIndexInformer
serviceSet *ArgoCDServiceSet
extensionManager *extension.Manager
shutdown func()
terminateRequested atomic.Bool
available atomic.Bool
}

type ArgoCDServerOpts struct {
Expand Down Expand Up @@ -329,6 +335,9 @@ func NewServer(ctx context.Context, opts ArgoCDServerOpts, appsetOpts Applicatio
pg := extension.NewDefaultProjectGetter(projLister, dbInstance)
ug := extension.NewDefaultUserGetter(policyEnf)
em := extension.NewManager(logger, opts.Namespace, sg, ag, pg, enf, ug)
noopShutdown := func() {
log.Error("API Server Shutdown function called but server is not started yet.")
}

a := &ArgoCDServer{
ArgoCDServerOpts: opts,
Expand All @@ -352,6 +361,8 @@ func NewServer(ctx context.Context, opts ArgoCDServerOpts, appsetOpts Applicatio
secretInformer: secretInformer,
configMapInformer: configMapInformer,
extensionManager: em,
shutdown: noopShutdown,
stopCh: make(chan os.Signal, 1),
}

err = a.logInClusterWarnings()
Expand All @@ -369,6 +380,12 @@ const (
)

func (a *ArgoCDServer) healthCheck(r *http.Request) error {
if a.terminateRequested.Load() {
return errors.New("API Server is terminating and unable to serve requests.")
}
if !a.available.Load() {
return errors.New("API Server is not available. It either hasn't started or is restarting.")
}
if val, ok := r.URL.Query()["full"]; ok && len(val) > 0 && val[0] == "true" {
argoDB := db.NewDB(a.Namespace, a.settingsMgr, a.KubeClientset)
_, err := argoDB.ListClusters(r.Context())
Expand Down Expand Up @@ -601,35 +618,118 @@ func (a *ArgoCDServer) Run(ctx context.Context, listeners *Listeners) {
log.Fatal("Timed out waiting for project cache to sync")
}

a.stopCh = make(chan struct{})
<-a.stopCh
shutdownFunc := func() {
log.Info("API Server shutdown initiated. Shutting down servers...")
a.available.Store(false)
shutdownCtx, cancel := context.WithTimeout(ctx, 20*time.Second)
defer cancel()
var wg gosync.WaitGroup

// Shutdown http server
wg.Add(1)
go func() {
defer wg.Done()
err := httpS.Shutdown(shutdownCtx)
if err != nil {
log.Errorf("Error shutting down http server: %s", err)
}
}()

if a.useTLS() {
// Shutdown https server
wg.Add(1)
go func() {
defer wg.Done()
err := httpsS.Shutdown(shutdownCtx)
if err != nil {
log.Errorf("Error shutting down https server: %s", err)
}
}()
}

// Shutdown gRPC server
wg.Add(1)
go func() {
defer wg.Done()
grpcS.GracefulStop()
}()

// Shutdown metrics server
wg.Add(1)
go func() {
defer wg.Done()
err := metricsServ.Shutdown(shutdownCtx)
if err != nil {
log.Errorf("Error shutting down metrics server: %s", err)
}
}()

if a.useTLS() {
// Shutdown tls server
wg.Add(1)
go func() {
defer wg.Done()
tlsm.Close()
}()
}

// Shutdown tcp server
wg.Add(1)
go func() {
defer wg.Done()
tcpm.Close()
}()

c := make(chan struct{})
// This goroutine will wait for all servers to conclude the shutdown
// process
go func() {
defer close(c)
wg.Wait()
}()

select {
case <-c:
log.Info("All servers were gracefully shutdown. Exiting...")
case <-shutdownCtx.Done():
log.Warn("Graceful shutdown timeout. Exiting...")
}
}
a.shutdown = shutdownFunc
signal.Notify(a.stopCh, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
a.available.Store(true)

select {
case signal := <-a.stopCh:
log.Infof("API Server received signal: %s", signal.String())
// SIGUSR1 is used for triggering a server restart
if signal != syscall.SIGUSR1 {
a.terminateRequested.Store(true)
}
a.shutdown()
case <-ctx.Done():
log.Infof("API Server: %s", ctx.Err())
a.terminateRequested.Store(true)
a.shutdown()
}
}

// Initialized reports whether both the project informer and the application
// informer have completed their initial sync.
func (a *ArgoCDServer) Initialized() bool {
	if !a.projInformer.HasSynced() {
		return false
	}
	return a.appInformer.HasSynced()
}

// TerminateRequested returns whether a shutdown was initiated by a signal or context cancel
// as opposed to a restart requested by the settings watch. Run sets the flag when it
// receives any signal other than SIGUSR1 or when its context is done; SIGUSR1, which
// watchSettings sends to trigger a server restart, leaves it unset.
func (a *ArgoCDServer) TerminateRequested() bool {
return a.terminateRequested.Load()
}

// checkServeErr checks the error from a .Serve() call to decide if it was a graceful shutdown
func (a *ArgoCDServer) checkServeErr(name string, err error) {
if err != nil {
if a.stopCh == nil {
// a nil stopCh indicates a graceful shutdown
log.Infof("graceful shutdown %s: %v", name, err)
} else {
log.Fatalf("%s: %v", name, err)
}
if err != nil && !errors.Is(err, http.ErrServerClosed) {
log.Errorf("Error received from server %s: %v", name, err)
} else {
log.Infof("graceful shutdown %s", name)
}
}

// Shutdown stops the Argo CD server
func (a *ArgoCDServer) Shutdown() {
log.Info("Shut down requested")
stopCh := a.stopCh
a.stopCh = nil
if stopCh != nil {
close(stopCh)
log.Infof("Graceful shutdown of %s initiated", name)
}
}

Expand Down Expand Up @@ -734,9 +834,10 @@ func (a *ArgoCDServer) watchSettings() {
}
}
log.Info("shutting down settings watch")
a.Shutdown()
a.settingsMgr.Unsubscribe(updateCh)
close(updateCh)
// Triggers server restart
a.stopCh <- syscall.SIGUSR1
}

func (a *ArgoCDServer) rbacPolicyLoader(ctx context.Context) {
Expand Down
68 changes: 68 additions & 0 deletions server/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
"os"
"path/filepath"
"strings"
gosync "sync"
"syscall"
"testing"
"time"

Expand Down Expand Up @@ -419,6 +421,72 @@ func TestCertsAreNotGeneratedInInsecureMode(t *testing.T) {
assert.Nil(t, s.settings.Certificate)
}

// TestGracefulShutdown exercises the health check and the server state flags
// around a signal-triggered shutdown:
//
//  1. before Run is called, healthCheck reports the server as not available;
//  2. once Run marks the server available, healthCheck succeeds;
//  3. after a SIGINT is delivered on stopCh and Run returns, healthCheck
//     reports the server as terminating, terminateRequested is set and
//     available is cleared.
func TestGracefulShutdown(t *testing.T) {
	port, err := test.GetFreePort()
	require.NoError(t, err)
	mockRepoClient := &mocks.Clientset{RepoServerServiceClient: &mocks.RepoServerServiceClient{}}
	kubeclientset := fake.NewSimpleClientset(test.NewFakeConfigMap(), test.NewFakeSecret())
	redis, redisCloser := test.NewInMemoryRedis()
	defer redisCloser()
	s := NewServer(
		context.Background(),
		ArgoCDServerOpts{
			ListenPort:    port,
			Namespace:     test.FakeArgoCDNamespace,
			KubeClientset: kubeclientset,
			AppClientset:  apps.NewSimpleClientset(),
			RepoClientset: mockRepoClient,
			RedisClient:   redis,
		},
		ApplicationSetOpts{},
	)

	projInformerCancel := test.StartInformer(s.projInformer)
	defer projInformerCancel()
	appInformerCancel := test.StartInformer(s.appInformer)
	defer appInformerCancel()
	appsetInformerCancel := test.StartInformer(s.appsetInformer)
	defer appsetInformerCancel()

	lns, err := s.Listen()
	require.NoError(t, err)

	shutdown := false
	runCtx, runCancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer runCancel()

	// Run has not been called yet, so the server was never marked available.
	// EqualError pins the exact message; require.Error would only use the
	// string as a failure description and never compare it.
	err = s.healthCheck(&http.Request{URL: &url.URL{Path: "/healthz", RawQuery: "full=true"}})
	require.EqualError(t, err, "API Server is not available. It either hasn't started or is restarting.")

	var wg gosync.WaitGroup
	wg.Add(1)
	go func(shutdown *bool) {
		defer wg.Done()
		s.Run(runCtx, lns)
		// Written before wg.Done and read only after wg.Wait, so no extra
		// synchronization is needed.
		*shutdown = true
	}(&shutdown)

	// Poll until Run marks the server available. If the run context expires
	// first the server can never become available, so fail instead of
	// spinning until the global test timeout.
	for !s.available.Load() {
		require.NoError(t, runCtx.Err(), "server did not become available before the run context expired")
		time.Sleep(10 * time.Millisecond)
	}
	err = s.healthCheck(&http.Request{URL: &url.URL{Path: "/healthz", RawQuery: "full=true"}})
	require.NoError(t, err)

	// Any signal other than SIGUSR1 requests termination rather than a restart.
	s.stopCh <- syscall.SIGINT

	wg.Wait()

	err = s.healthCheck(&http.Request{URL: &url.URL{Path: "/healthz", RawQuery: "full=true"}})
	require.EqualError(t, err, "API Server is terminating and unable to serve requests.")

	assert.True(t, s.terminateRequested.Load())
	assert.False(t, s.available.Load())
	assert.True(t, shutdown)
}

func TestAuthenticate(t *testing.T) {
type testData struct {
test string
Expand Down
47 changes: 47 additions & 0 deletions test/e2e/aa_graceful_restart_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package e2e

import (
"net/http"
"strings"
"testing"
"time"

"github.com/stretchr/testify/require"

"github.com/argoproj/argo-cd/v2/test/e2e/fixture"
. "github.com/argoproj/argo-cd/v2/test/e2e/fixture"
)

// checkHealth issues GET /healthz?full=true against the API server.
//
// When requireHealthy is true the request must succeed with 200 OK.
// Otherwise the server may be in the middle of a restart, so the following
// outcomes are all accepted: a "connection refused" or "connection reset by
// peer" transport error, a 200 OK, or a 503 Service Unavailable. Anything
// else fails the test.
func checkHealth(t *testing.T, requireHealthy bool) {
	t.Helper()
	resp, err := DoHttpRequest("GET", "/healthz?full=true", "")
	if err == nil && resp != nil {
		// This helper is called in a tight polling loop; close the body so
		// each iteration does not leak a connection.
		// NOTE(review): assumes DoHttpRequest returns *http.Response - confirm.
		defer resp.Body.Close()
	}

	if requireHealthy {
		require.NoError(t, err)
		require.Equal(t, http.StatusOK, resp.StatusCode)
		return
	}

	if err != nil {
		refused := strings.Contains(err.Error(), "connection refused")
		reset := strings.Contains(err.Error(), "connection reset by peer")
		if !refused && !reset {
			require.NoErrorf(t, err, "If an error returned, it must be about connection refused or reset by peer")
		}
		return
	}
	require.Contains(t, []int{http.StatusOK, http.StatusServiceUnavailable}, resp.StatusCode)
}

// TestAPIServerGracefulRestart changes a settings ConfigMap parameter, which
// should trigger an API server restart, and then polls the health endpoint
// throughout the restart window. During the window only the outcomes tolerated
// by checkHealth(t, false) are allowed; once the window has elapsed the server
// must be healthy again.
func TestAPIServerGracefulRestart(t *testing.T) {
	const (
		// 300 polls spaced 100ms apart give a restart window of roughly 30s.
		pollAttempts = 300
		pollInterval = 100 * time.Millisecond
	)

	EnsureCleanState(t)

	// The server must be healthy before the restart is triggered.
	checkHealth(t, true)

	// Should trigger API server restart.
	fixture.SetParamInSettingConfigMap("additionalUrls", "- http://test")

	for attempt := 0; attempt < pollAttempts; attempt++ {
		checkHealth(t, false)
		time.Sleep(pollInterval)
	}

	// A server that is still unhealthy at this point restarted too slowly.
	checkHealth(t, true)
}

0 comments on commit de51fbb

Please sign in to comment.