From b5337ce4d01657cb9f8b3993488f62c27eee1ce5 Mon Sep 17 00:00:00 2001 From: Bruno Bressi <52347078+puffitos@users.noreply.github.com> Date: Mon, 27 Jan 2025 12:47:04 +0100 Subject: [PATCH] No more sending on closed channels during checkscontroller shutdown (#245) * feat: added metrics to targetmanager First metric announces registration state. * test: added metrics to the tests * fix: race condition when shutting down checks When teh sparrow check controller would shut down, it would iterate over the actual slice of the checks, shutdown each check and then procceed to delete said check from the slice. Since the shutting down procedure is instant, there was a race condition that would delete a wrong check from the slice and then the same shutting down loop would try and shutdown the same check again. Just returning a copy of the slice resolves this problem, as the iteration is now done on the copy only. A more sophisticated deletion routine for the checks slice could be another way to handle this, but it would probably increase the complexity of the checks and checkscontroller structs. * chore: shutdown message on end * test: added validating test case Signed-off-by: Bruno Bressi * chore: marked function as helper * test: added test for the controller's update function This test proves that the shallow clone works as intended and returns a clone of the slice, where the original references can still be used and updated. * chore: bumped golangci lint to latest version This should fix the bodyclose linting remarks Signed-off-by: Bruno Bressi --------- Signed-off-by: Bruno Bressi --- .github/workflows/pre-commit.yml | 2 +- cmd/run.go | 7 +- pkg/api/api_test.go | 2 +- pkg/checks/runtime/checks.go | 3 +- pkg/sparrow/controller_test.go | 147 +++++++++++++++++++++++++++++++ pkg/sparrow/handlers_test.go | 2 +- pkg/sparrow/run.go | 5 +- pkg/sparrow/run_errors.go | 3 + 8 files changed, 162 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index bc19f374..fdecb610 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -17,7 +17,7 @@ jobs: - name: Install go toolchain for pre-commit run: | - curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.60.3 + curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.63.4 go install mvdan.cc/gofumpt@latest go install github.com/matryer/moq@latest go install github.com/norwoodj/helm-docs/cmd/helm-docs@latest diff --git a/cmd/run.go b/cmd/run.go index 6cea2eee..6a5eca7f 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -86,17 +86,18 @@ func run() func(cmd *cobra.Command, args []string) error { s := sparrow.New(cfg) cErr := make(chan error, 1) - log.Info("Running sparrow") + log.InfoContext(ctx, "Running sparrow") go func() { cErr <- s.Run(ctx) }() select { case <-sigChan: - log.Info("Signal received, shutting down") + log.InfoContext(ctx, "Signal received, shutting down") cancel() <-cErr - case err := <-cErr: + case err = <-cErr: + log.InfoContext(ctx, "Sparrow was shut down") return err } diff --git a/pkg/api/api_test.go b/pkg/api/api_test.go index 1637e16e..e22da703 100644 --- a/pkg/api/api_test.go +++ b/pkg/api/api_test.go @@ -77,7 +77,7 @@ func TestAPI_Run(t *testing.T) { rec := httptest.NewRecorder() a.router.ServeHTTP(rec, req) - if status := rec.Result().StatusCode; status != tt.want.status { //nolint:bodyclose // closed in defer below + if status := rec.Result().StatusCode; status != tt.want.status { t.Errorf("Handler for route %s returned wrong status code: got %v want %v", tt.want.path, status, tt.want.status) } diff --git a/pkg/checks/runtime/checks.go b/pkg/checks/runtime/checks.go index 5a34cbca..e5c550da 100644 --- a/pkg/checks/runtime/checks.go +++ b/pkg/checks/runtime/checks.go @@ -19,6 +19,7 @@ package runtime import ( + "slices" "sync" "github.com/telekom/sparrow/pkg/checks" @@ -53,5 +54,5 @@ func (c *Checks) Delete(check checks.Check) { func (c *Checks) Iter() []checks.Check { c.mu.RLock() defer c.mu.RUnlock() - return c.checks + return slices.Clone(c.checks) } diff --git a/pkg/sparrow/controller_test.go b/pkg/sparrow/controller_test.go index f3b22e20..bbebc002 100644 --- a/pkg/sparrow/controller_test.go +++ b/pkg/sparrow/controller_test.go @@ -103,6 +103,58 @@ func TestRun_ContextCancellation(t *testing.T) { } } +// TestChecksController_Shutdown tests the shutdown of the ChecksController +// when none, one or multiple checks are registered. The test checks that after shutdown no +// checks are registered anymore (the checks slice is empty) and that the done channel is closed. +func TestChecksController_Shutdown(t *testing.T) { + tests := []struct { + name string + checks []checks.Check + }{ + { + name: "no checks registered", + checks: nil, + }, + { + name: "one check registered", + checks: []checks.Check{newMockCheck(t, "mockCheck")}, + }, + { + name: "multiple checks registered", + checks: []checks.Check{ + newMockCheck(t, "mockCheck1"), + newMockCheck(t, "mockCheck2"), + newMockCheck(t, "mockCheck3"), + newMockCheck(t, "mockCheck4"), + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cc := NewChecksController(db.NewInMemory(), metrics.New(metrics.Config{})) + + if tt.checks != nil { + for _, check := range tt.checks { + cc.RegisterCheck(context.Background(), check) + } + } + + cc.Shutdown(context.Background()) + + select { + case <-cc.done: + if len(cc.checks.Iter()) != 0 { + t.Errorf("Expected no checks to be registered") + } + return + case <-time.After(time.Second): + t.Fatal("Expected done channel to be closed") + } + }) + } +} + func TestChecksController_Reconcile(t *testing.T) { ctx, cancel := logger.NewContextWithLogger(context.Background()) defer cancel() @@ -235,6 +287,74 @@ func TestChecksController_Reconcile(t *testing.T) { } } +// TestChecksController_Reconcile_Update tests the update of the checks +// when the runtime configuration changes. +func TestChecksController_Reconcile_Update(t *testing.T) { + ctx, cancel := logger.NewContextWithLogger(context.Background()) + defer cancel() + + tests := []struct { + name string + checks []checks.Check + newRuntimeConfig runtime.Config + }{ + { + name: "update health check", + checks: []checks.Check{ + health.NewCheck(), + }, + newRuntimeConfig: runtime.Config{ + Health: &health.Config{ + Targets: []string{"https://new.com"}, + Interval: 200 * time.Millisecond, + Timeout: 1000 * time.Millisecond, + }, + }, + }, + { + name: "update health & latency check", + checks: []checks.Check{ + health.NewCheck(), + latency.NewCheck(), + }, + newRuntimeConfig: runtime.Config{ + Health: &health.Config{ + Targets: []string{"https://new.com"}, + Interval: 200 * time.Millisecond, + Timeout: 1000 * time.Millisecond, + }, + Latency: &latency.Config{ + Targets: []string{"https://new.com"}, + Interval: 200 * time.Millisecond, + Timeout: 1000 * time.Millisecond, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cc := NewChecksController(db.NewInMemory(), metrics.New(metrics.Config{})) + for _, c := range tt.checks { + cc.checks.Add(c) + } + + cc.Reconcile(ctx, tt.newRuntimeConfig) + + for _, c := range cc.checks.Iter() { + switch c.GetConfig().For() { + case health.CheckName: + hc := c.(*health.Health) + assert.Equal(t, tt.newRuntimeConfig.Health.Targets, hc.GetConfig().(*health.Config).Targets) + case latency.CheckName: + lc := c.(*latency.Latency) + assert.Equal(t, tt.newRuntimeConfig.Latency.Targets, lc.GetConfig().(*latency.Config).Targets) + } + } + }) + } +} + func TestChecksController_RegisterCheck(t *testing.T) { tests := []struct { name string @@ -379,3 +499,30 @@ func TestGenerateCheckSpecs(t *testing.T) { }) } } + +// newMockCheck creates a new mock check with the given name. +func newMockCheck(t *testing.T, name string) *checks.CheckMock { + t.Helper() + return &checks.CheckMock{ + GetMetricCollectorsFunc: func() []prometheus.Collector { + return []prometheus.Collector{ + prometheus.NewCounter(prometheus.CounterOpts{ + Name: fmt.Sprintf("%s_mock_metric", name), + }), + } + }, + NameFunc: func() string { + return name + }, + RemoveLabelledMetricsFunc: nil, + RunFunc: func(ctx context.Context, cResult chan checks.ResultDTO) error { + t.Logf("Run called for check %s", name) + return nil + }, + SchemaFunc: nil, + ShutdownFunc: func() { + t.Logf("Shutdown called for check %s", name) + }, + UpdateConfigFunc: nil, + } +} diff --git a/pkg/sparrow/handlers_test.go b/pkg/sparrow/handlers_test.go index 32793f80..a21b6d91 100644 --- a/pkg/sparrow/handlers_test.go +++ b/pkg/sparrow/handlers_test.go @@ -129,7 +129,7 @@ func TestSparrow_handleCheckMetrics(t *testing.T) { } s.handleCheckMetrics(w, r) - resp := w.Result() //nolint:bodyclose + resp := w.Result() body, _ := io.ReadAll(resp.Body) if tt.wantCode == http.StatusOK { diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go index 19e72a30..4a2f0b3e 100644 --- a/pkg/sparrow/run.go +++ b/pkg/sparrow/run.go @@ -131,6 +131,7 @@ func (s *Sparrow) Run(ctx context.Context) error { s.shutdown(ctx) } case <-s.cDone: + log.InfoContext(ctx, "Sparrow was shut down") return fmt.Errorf("sparrow was shut down") } } @@ -181,7 +182,7 @@ func (s *Sparrow) shutdown(ctx context.Context) { defer cancel() s.shutOnce.Do(func() { - log.Info("Shutting down sparrow gracefully") + log.InfoContext(ctx, "Shutting down sparrow") var sErrs ErrShutdown if s.tarMan != nil { sErrs.errTarMan = s.tarMan.Shutdown(ctx) @@ -192,7 +193,7 @@ func (s *Sparrow) shutdown(ctx context.Context) { s.controller.Shutdown(ctx) if sErrs.HasError() { - log.Error("Failed to shutdown gracefully", "contextError", errC, "errors", sErrs) + log.ErrorContext(ctx, "Failed to shutdown gracefully", "contextError", errC, "errors", sErrs) } // Signal that shutdown is complete diff --git a/pkg/sparrow/run_errors.go b/pkg/sparrow/run_errors.go index e1f983ba..36630b4d 100644 --- a/pkg/sparrow/run_errors.go +++ b/pkg/sparrow/run_errors.go @@ -18,12 +18,15 @@ package sparrow +// ErrShutdown holds any errors that may +// have occurred during shutdown of the Sparrow type ErrShutdown struct { errAPI error errTarMan error errMetrics error } +// HasError returns true if any of the errors are set func (e ErrShutdown) HasError() bool { return e.errAPI != nil || e.errTarMan != nil || e.errMetrics != nil }