From b5337ce4d01657cb9f8b3993488f62c27eee1ce5 Mon Sep 17 00:00:00 2001
From: Bruno Bressi <52347078+puffitos@users.noreply.github.com>
Date: Mon, 27 Jan 2025 12:47:04 +0100
Subject: [PATCH] No more sending on closed channels during checkscontroller
 shutdown (#245)

* feat: added metrics to targetmanager

First metric announces registration state.

* test: added metrics to the tests

* fix: race condition when shutting down checks

When teh sparrow check controller would shut down, it would iterate over
the actual slice of the checks, shutdown each check and then procceed to
delete said check from the slice.

Since the shutting down procedure is instant, there was a race condition
that would delete a wrong check from the slice and then the same
shutting down loop would try and shutdown the same check again.

Just returning a copy of the slice resolves this problem, as the
iteration is now done on the copy only. A more sophisticated deletion
routine for the checks slice could be another way to handle this, but it
would probably increase the complexity of the checks and
checkscontroller structs.

* chore: shutdown message on end

* test: added validating test case

Signed-off-by: Bruno Bressi <puffito@gmail.com>

* chore: marked function as helper

* test: added test for the controller's update function

This test proves that the shallow clone works as intended and returns a clone of the slice, where the original references can still be used and updated.

* chore: bumped golangci lint to latest version

This should fix the bodyclose linting remarks

Signed-off-by: Bruno Bressi <puffito@gmail.com>

---------

Signed-off-by: Bruno Bressi <puffito@gmail.com>
---
 .github/workflows/pre-commit.yml |   2 +-
 cmd/run.go                       |   7 +-
 pkg/api/api_test.go              |   2 +-
 pkg/checks/runtime/checks.go     |   3 +-
 pkg/sparrow/controller_test.go   | 147 +++++++++++++++++++++++++++++++
 pkg/sparrow/handlers_test.go     |   2 +-
 pkg/sparrow/run.go               |   5 +-
 pkg/sparrow/run_errors.go        |   3 +
 8 files changed, 162 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index bc19f374..fdecb610 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
 
       - name: Install go toolchain for pre-commit
         run: |
-          curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.60.3
+          curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.63.4
           go install mvdan.cc/gofumpt@latest
           go install github.com/matryer/moq@latest
           go install github.com/norwoodj/helm-docs/cmd/helm-docs@latest
diff --git a/cmd/run.go b/cmd/run.go
index 6cea2eee..6a5eca7f 100644
--- a/cmd/run.go
+++ b/cmd/run.go
@@ -86,17 +86,18 @@ func run() func(cmd *cobra.Command, args []string) error {
 
 		s := sparrow.New(cfg)
 		cErr := make(chan error, 1)
-		log.Info("Running sparrow")
+		log.InfoContext(ctx, "Running sparrow")
 		go func() {
 			cErr <- s.Run(ctx)
 		}()
 
 		select {
 		case <-sigChan:
-			log.Info("Signal received, shutting down")
+			log.InfoContext(ctx, "Signal received, shutting down")
 			cancel()
 			<-cErr
-		case err := <-cErr:
+		case err = <-cErr:
+			log.InfoContext(ctx, "Sparrow was shut down")
 			return err
 		}
 
diff --git a/pkg/api/api_test.go b/pkg/api/api_test.go
index 1637e16e..e22da703 100644
--- a/pkg/api/api_test.go
+++ b/pkg/api/api_test.go
@@ -77,7 +77,7 @@ func TestAPI_Run(t *testing.T) {
 				rec := httptest.NewRecorder()
 				a.router.ServeHTTP(rec, req)
 
-				if status := rec.Result().StatusCode; status != tt.want.status { //nolint:bodyclose // closed in defer below
+				if status := rec.Result().StatusCode; status != tt.want.status {
 					t.Errorf("Handler for route %s returned wrong status code: got %v want %v", tt.want.path, status, tt.want.status)
 				}
 
diff --git a/pkg/checks/runtime/checks.go b/pkg/checks/runtime/checks.go
index 5a34cbca..e5c550da 100644
--- a/pkg/checks/runtime/checks.go
+++ b/pkg/checks/runtime/checks.go
@@ -19,6 +19,7 @@
 package runtime
 
 import (
+	"slices"
 	"sync"
 
 	"github.com/telekom/sparrow/pkg/checks"
@@ -53,5 +54,5 @@ func (c *Checks) Delete(check checks.Check) {
 func (c *Checks) Iter() []checks.Check {
 	c.mu.RLock()
 	defer c.mu.RUnlock()
-	return c.checks
+	return slices.Clone(c.checks)
 }
diff --git a/pkg/sparrow/controller_test.go b/pkg/sparrow/controller_test.go
index f3b22e20..bbebc002 100644
--- a/pkg/sparrow/controller_test.go
+++ b/pkg/sparrow/controller_test.go
@@ -103,6 +103,58 @@ func TestRun_ContextCancellation(t *testing.T) {
 	}
 }
 
+// TestChecksController_Shutdown tests the shutdown of the ChecksController
+// when none, one or multiple checks are registered. The test checks that after shutdown no
+// checks are registered anymore (the checks slice is empty) and that the done channel is closed.
+func TestChecksController_Shutdown(t *testing.T) {
+	tests := []struct {
+		name   string
+		checks []checks.Check
+	}{
+		{
+			name:   "no checks registered",
+			checks: nil,
+		},
+		{
+			name:   "one check registered",
+			checks: []checks.Check{newMockCheck(t, "mockCheck")},
+		},
+		{
+			name: "multiple checks registered",
+			checks: []checks.Check{
+				newMockCheck(t, "mockCheck1"),
+				newMockCheck(t, "mockCheck2"),
+				newMockCheck(t, "mockCheck3"),
+				newMockCheck(t, "mockCheck4"),
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cc := NewChecksController(db.NewInMemory(), metrics.New(metrics.Config{}))
+
+			if tt.checks != nil {
+				for _, check := range tt.checks {
+					cc.RegisterCheck(context.Background(), check)
+				}
+			}
+
+			cc.Shutdown(context.Background())
+
+			select {
+			case <-cc.done:
+				if len(cc.checks.Iter()) != 0 {
+					t.Errorf("Expected no checks to be registered")
+				}
+				return
+			case <-time.After(time.Second):
+				t.Fatal("Expected done channel to be closed")
+			}
+		})
+	}
+}
+
 func TestChecksController_Reconcile(t *testing.T) {
 	ctx, cancel := logger.NewContextWithLogger(context.Background())
 	defer cancel()
@@ -235,6 +287,74 @@ func TestChecksController_Reconcile(t *testing.T) {
 	}
 }
 
+// TestChecksController_Reconcile_Update tests the update of the checks
+// when the runtime configuration changes.
+func TestChecksController_Reconcile_Update(t *testing.T) {
+	ctx, cancel := logger.NewContextWithLogger(context.Background())
+	defer cancel()
+
+	tests := []struct {
+		name             string
+		checks           []checks.Check
+		newRuntimeConfig runtime.Config
+	}{
+		{
+			name: "update health check",
+			checks: []checks.Check{
+				health.NewCheck(),
+			},
+			newRuntimeConfig: runtime.Config{
+				Health: &health.Config{
+					Targets:  []string{"https://new.com"},
+					Interval: 200 * time.Millisecond,
+					Timeout:  1000 * time.Millisecond,
+				},
+			},
+		},
+		{
+			name: "update health & latency check",
+			checks: []checks.Check{
+				health.NewCheck(),
+				latency.NewCheck(),
+			},
+			newRuntimeConfig: runtime.Config{
+				Health: &health.Config{
+					Targets:  []string{"https://new.com"},
+					Interval: 200 * time.Millisecond,
+					Timeout:  1000 * time.Millisecond,
+				},
+				Latency: &latency.Config{
+					Targets:  []string{"https://new.com"},
+					Interval: 200 * time.Millisecond,
+					Timeout:  1000 * time.Millisecond,
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cc := NewChecksController(db.NewInMemory(), metrics.New(metrics.Config{}))
+			for _, c := range tt.checks {
+				cc.checks.Add(c)
+			}
+
+			cc.Reconcile(ctx, tt.newRuntimeConfig)
+
+			for _, c := range cc.checks.Iter() {
+				switch c.GetConfig().For() {
+				case health.CheckName:
+					hc := c.(*health.Health)
+					assert.Equal(t, tt.newRuntimeConfig.Health.Targets, hc.GetConfig().(*health.Config).Targets)
+				case latency.CheckName:
+					lc := c.(*latency.Latency)
+					assert.Equal(t, tt.newRuntimeConfig.Latency.Targets, lc.GetConfig().(*latency.Config).Targets)
+				}
+			}
+		})
+	}
+}
+
 func TestChecksController_RegisterCheck(t *testing.T) {
 	tests := []struct {
 		name  string
@@ -379,3 +499,30 @@ func TestGenerateCheckSpecs(t *testing.T) {
 		})
 	}
 }
+
+// newMockCheck creates a new mock check with the given name.
+func newMockCheck(t *testing.T, name string) *checks.CheckMock {
+	t.Helper()
+	return &checks.CheckMock{
+		GetMetricCollectorsFunc: func() []prometheus.Collector {
+			return []prometheus.Collector{
+				prometheus.NewCounter(prometheus.CounterOpts{
+					Name: fmt.Sprintf("%s_mock_metric", name),
+				}),
+			}
+		},
+		NameFunc: func() string {
+			return name
+		},
+		RemoveLabelledMetricsFunc: nil,
+		RunFunc: func(ctx context.Context, cResult chan checks.ResultDTO) error {
+			t.Logf("Run called for check %s", name)
+			return nil
+		},
+		SchemaFunc: nil,
+		ShutdownFunc: func() {
+			t.Logf("Shutdown called for check %s", name)
+		},
+		UpdateConfigFunc: nil,
+	}
+}
diff --git a/pkg/sparrow/handlers_test.go b/pkg/sparrow/handlers_test.go
index 32793f80..a21b6d91 100644
--- a/pkg/sparrow/handlers_test.go
+++ b/pkg/sparrow/handlers_test.go
@@ -129,7 +129,7 @@ func TestSparrow_handleCheckMetrics(t *testing.T) {
 			}
 
 			s.handleCheckMetrics(w, r)
-			resp := w.Result() //nolint:bodyclose
+			resp := w.Result()
 			body, _ := io.ReadAll(resp.Body)
 
 			if tt.wantCode == http.StatusOK {
diff --git a/pkg/sparrow/run.go b/pkg/sparrow/run.go
index 19e72a30..4a2f0b3e 100644
--- a/pkg/sparrow/run.go
+++ b/pkg/sparrow/run.go
@@ -131,6 +131,7 @@ func (s *Sparrow) Run(ctx context.Context) error {
 				s.shutdown(ctx)
 			}
 		case <-s.cDone:
+			log.InfoContext(ctx, "Sparrow was shut down")
 			return fmt.Errorf("sparrow was shut down")
 		}
 	}
@@ -181,7 +182,7 @@ func (s *Sparrow) shutdown(ctx context.Context) {
 	defer cancel()
 
 	s.shutOnce.Do(func() {
-		log.Info("Shutting down sparrow gracefully")
+		log.InfoContext(ctx, "Shutting down sparrow")
 		var sErrs ErrShutdown
 		if s.tarMan != nil {
 			sErrs.errTarMan = s.tarMan.Shutdown(ctx)
@@ -192,7 +193,7 @@ func (s *Sparrow) shutdown(ctx context.Context) {
 		s.controller.Shutdown(ctx)
 
 		if sErrs.HasError() {
-			log.Error("Failed to shutdown gracefully", "contextError", errC, "errors", sErrs)
+			log.ErrorContext(ctx, "Failed to shutdown gracefully", "contextError", errC, "errors", sErrs)
 		}
 
 		// Signal that shutdown is complete
diff --git a/pkg/sparrow/run_errors.go b/pkg/sparrow/run_errors.go
index e1f983ba..36630b4d 100644
--- a/pkg/sparrow/run_errors.go
+++ b/pkg/sparrow/run_errors.go
@@ -18,12 +18,15 @@
 
 package sparrow
 
+// ErrShutdown holds any errors that may
+// have occurred during shutdown of the Sparrow
 type ErrShutdown struct {
 	errAPI     error
 	errTarMan  error
 	errMetrics error
 }
 
+// HasError returns true if any of the errors are set
 func (e ErrShutdown) HasError() bool {
 	return e.errAPI != nil || e.errTarMan != nil || e.errMetrics != nil
 }