pillar/watcher: Add unit tests for goroutine leak detection.

Introduce a suite of tests to verify the functionality of the goroutine leak detection system, simulating various scenarios to ensure robust leak identification. Growth scenarios: Test scenarios for initial goroutine growth with stabilization, sudden spikes, and decreases after spikes to validate that these typical patterns do not trigger false positives. Leak scenario: Include tests where the goroutine count gradually increases, simulating a leak. The test verifies that the detector correctly identifies this pattern as a leak. Monitoring tests: Implement tests for the `goroutinesMonitor` function to check detection behavior at regular intervals. Edge cases for the helper function: Add tests to handle empty data inputs, minimal data length, and cases where the moving average window exceeds data length, ensuring resilience against edge cases. Signed-off-by: Nikolay Martyanov <nikolay@zededa.com>
lf-edge · Nov 6, 2024 · cc28001 · cc28001
1 parent 1177842
commit cc28001
Showing 1 changed file with 340 additions and 0 deletions.
diff --git a/pkg/pillar/cmd/watcher/watcher_test.go b/pkg/pillar/cmd/watcher/watcher_test.go
@@ -0,0 +1,340 @@
+// Copyright (c) 2024 Zededa, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package watcher
+
+import (
+	"github.com/lf-edge/eve/pkg/pillar/agentlog"
+	"io"
+	"math"
+	"os"
+	"reflect"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// Scenario 1: Initial growth that slows down and then stabilizes
+func emulateSystemStart(startDuration, stabilizationDuration int) []int {
+	totalDuration := startDuration + stabilizationDuration
+	data := make([]int, startDuration)
+	baseGoroutines := 0
+	for i := 0; i < startDuration; i++ {
+		// Simulate fast growth that slows down
+		growth := int(500 * (1 - math.Exp(-float64(i)/10))) // Exponential decay
+		data[i] = baseGoroutines + growth
+	}
+	// Stabilize after growth
+	stableGoroutines := data[len(data)-1]
+	for i := startDuration; i < totalDuration; i++ {
+		data = append(data, stableGoroutines)
+	}
+	return data
+}
+
+// Scenario 2: Stabilization, then a process creates a lot of goroutines quickly, which then stabilizes
+func emulateSpikeAfterSystemStart(startDuration, stabilizationDuration, spikeDuration int) []int {
+	data := emulateSystemStart(startDuration, stabilizationDuration)
+	totalStartDuration := len(data)
+	baseGoroutines := data[len(data)-1]
+	for i := totalStartDuration; i < totalStartDuration+spikeDuration; i++ {
+		growth := int(100 * (1 - math.Exp(-float64(i-startDuration)/5))) // Quick spike
+		data = append(data, baseGoroutines+growth)
+	}
+	// Stabilize after spike
+	stableGoroutines := data[len(data)-1]
+	for i := totalStartDuration + spikeDuration; i < totalStartDuration+spikeDuration+stabilizationDuration; i++ {
+		data = append(data, stableGoroutines)
+	}
+	return data
+}
+
+// Scenario 3: After the spike, goroutine count decreases
+func emulateDecreaseAfterSpike(decreaseDuration, spikeDuration, stabilizationDuration, startDuration int) []int {
+	data := emulateSpikeAfterSystemStart(startDuration, stabilizationDuration, spikeDuration)
+	decreaseStart := len(data)
+	baseGoroutines := data[decreaseStart-1]
+	for i := 0; i < decreaseDuration; i++ {
+		decrease := int(float64(baseGoroutines) * (1 - float64(i)/float64(decreaseDuration)))
+		data = append(data, decrease)
+	}
+	return data
+}
+
+// Scenario 4: After the spike, goroutine count starts to slowly increase over time
+func emulateLeakAfterSpike(leakDuration, stabilizationDuration, spikeDuration, startDuration int) []int {
+	data := emulateSpikeAfterSystemStart(startDuration, stabilizationDuration, spikeDuration)
+	increaseStart := len(data)
+	baseGoroutines := data[increaseStart-1]
+	for i := increaseStart; i < increaseStart+leakDuration; i++ {
+		// Slow linear increase
+		growth := baseGoroutines + (i-increaseStart)*5
+		data = append(data, growth)
+	}
+	return data
+}
+
+func TestMain(m *testing.M) {
+	logger, log = agentlog.Init("watcher")
+	os.Exit(m.Run())
+}
+
+// Computes moving average correctly for valid data and window size
+func TestMovingAverageValidData(t *testing.T) {
+	data := []int{1, 2, 3, 4, 5}
+	windowSize := 3
+	expected := []float64{2.0, 3.0, 4.0}
+
+	result := movingAverage(data, windowSize)
+
+	if !reflect.DeepEqual(result, expected) {
+		t.Errorf("Expected %v, got %v", expected, result)
+	}
+}
+
+// Handles window size equal to data length by returning a single average
+func TestMovingAverageWindowSizeEqualDataLength(t *testing.T) {
+	data := []int{1, 2, 3, 4, 5}
+	windowSize := 5
+	expected := []float64{3.0}
+
+	result := movingAverage(data, windowSize)
+
+	if !reflect.DeepEqual(result, expected) {
+		t.Errorf("Expected %v, got %v", expected, result)
+	}
+}
+
+// Handles empty data array gracefully
+func TestMovingAverageEmptyData(t *testing.T) {
+	data := []int{}
+	windowSize := 3
+	var expected []float64
+	expected = nil
+
+	result := movingAverage(data, windowSize)
+
+	if !reflect.DeepEqual(result, expected) {
+		t.Errorf("Expected %v (type %T), got %v (type %T)", expected, expected, result, result)
+	}
+}
+
+// Manages window size of zero by defaulting to 1
+func TestMovingAverageWindowSizeZero(t *testing.T) {
+	data := []int{1, 2, 3, 4, 5}
+	windowSize := 0
+	expected := []float64{1.0, 2.0, 3.0, 4.0, 5.0}
+
+	result := movingAverage(data, windowSize)
+
+	if !reflect.DeepEqual(result, expected) {
+		t.Errorf("Expected %v, got %v", expected, result)
+	}
+}
+
+// Deals with window size larger than data length by defaulting to data length
+func TestMovingAverageWindowSizeLargerThanDataLength(t *testing.T) {
+	data := []int{1, 2, 3}
+	windowSize := 5
+	expected := []float64{2.0}
+
+	result := movingAverage(data, windowSize)
+
+	if !reflect.DeepEqual(result, expected) {
+		t.Errorf("Expected %v, got %v", expected, result)
+	}
+}
+
+func TestGoroutineLeakDetectorWithSystemStart(t *testing.T) {
+	stats := emulateSystemStart(5, 5)
+	detected, _ := detectGoroutineLeaks(stats)
+	if detected {
+		t.Errorf("Expected no goroutine leak, but detected one")
+	}
+}
+
+func TestGoroutineLeakDetectorWithLegitSpike(t *testing.T) {
+	stats := emulateSpikeAfterSystemStart(5, 5, 20)
+	detected, _ := detectGoroutineLeaks(stats)
+	if detected {
+		t.Errorf("Expected no goroutine leak, but detected one")
+	}
+}
+
+func TestGoroutineLeakDetectorWithDecreaseAfterSpike(t *testing.T) {
+	stats := emulateDecreaseAfterSpike(10, 20, 5, 5)
+	detected, _ := detectGoroutineLeaks(stats)
+	if detected {
+		t.Errorf("Expected no goroutine leak, but detected one")
+	}
+}
+
+func TestGoroutineLeakDetectorWithLeakAfterSpike(t *testing.T) {
+	stats := emulateLeakAfterSpike(100, 5, 20, 5)
+	detected, _ := detectGoroutineLeaks(stats)
+	if !detected {
+		t.Errorf("Expected goroutine leak to be detected, but it was not")
+	}
+}
+
+func TestGoroutineLeakDetectorWithLeakEachStep(t *testing.T) {
+	startDuration := 5
+	stabilizationDuration := 5
+	spikeDuration := 20
+	leakDuration := 100
+	leakMayBeDetectedAfter := startDuration + stabilizationDuration + spikeDuration + stabilizationDuration
+	possibleFalsePositives := 60
+	leakMustBeDetectedAfter := leakMayBeDetectedAfter + possibleFalsePositives
+	stats := emulateLeakAfterSpike(leakDuration, stabilizationDuration, spikeDuration, startDuration)
+	// Now check the behavior of detector on each new data point
+	for i := 0; i < len(stats); i++ {
+		detected, _ := detectGoroutineLeaks(stats[:i])
+		// Leak should be detected after the slow increase starts
+		if detected && i < startDuration+stabilizationDuration+spikeDuration+stabilizationDuration {
+			t.Errorf("Expected no goroutine leak, but detected one at step %d", i)
+		}
+		if !detected && i >= leakMayBeDetectedAfter && i < leakMustBeDetectedAfter {
+			t.Logf("Expected goroutine leak to be detected, but it was not at step %d", i)
+		}
+		if !detected && i >= leakMustBeDetectedAfter {
+			t.Errorf("Expected goroutine leak to be detected, but it was not at step %d", i)
+		}
+	}
+}
+
+// Handles empty input data gracefully
+func TestEmptyInputData(t *testing.T) {
+	stats := []int{}
+	detected, smoothedData := detectGoroutineLeaks(stats)
+	if detected || smoothedData != nil {
+		t.Errorf("Expected no detection and nil smoothed data for empty input")
+	}
+}
+
+// Handles input data with fewer than two elements
+func TestInputDataWithFewerThanTwoElements(t *testing.T) {
+	stats := []int{5}
+	detected, smoothedData := detectGoroutineLeaks(stats)
+	if detected || len(smoothedData) != 0 {
+		t.Errorf("Expected no detection and empty smoothed data for input with fewer than two elements")
+	}
+}
+
+// Handles window size larger than data length
+func TestWindowSizeLargerThanDataLength(t *testing.T) {
+	stats := []int{1, 2}
+	windowSize := len(stats) + 1
+	smoothedData := movingAverage(stats, windowSize)
+	if len(smoothedData) != 1 {
+		t.Errorf("Expected smoothed data length of 1 when window size is larger than data length")
+	}
+}
+
+// Monitors goroutine count at regular intervals
+func TestGoroutinesMonitorNoLeak(t *testing.T) {
+	keepStatsFor := 24 * 60 * time.Millisecond
+	goroutinesThreshold := 100
+	checkInterval := 1 * time.Millisecond
+	checkStatsFor := 60 * time.Millisecond
+	cooldownPeriod := 10 * time.Millisecond
+
+	backupOut := logger.Out
+	// Create a pipe to capture log output
+	r, w, _ := os.Pipe()
+	logger.Out = w
+
+	go func() {
+		goroutinesMonitor(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod)
+	}()
+
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+
+	// Create a goroutine to simulate the creation of goroutines
+	go func() {
+		timeStart := time.Now()
+		for {
+			if time.Since(timeStart) > 2*keepStatsFor {
+				break
+			}
+			// Create a goroutine
+			go func() {
+				time.Sleep(checkInterval / 2)
+			}()
+			time.Sleep(2 * checkInterval)
+		}
+		wg.Done()
+	}()
+
+	wg.Wait()
+
+	// Close the pipe
+	w.Close()
+
+	// Read the log output
+	output, _ := io.ReadAll(r)
+
+	// Check if the log output does not contain the detection message
+	// If it does, it means that the goroutine leak was detected
+	if strings.Contains(string(output), "leak detected") {
+		t.Errorf("Expected no goroutine leak to be detected")
+	}
+
+	logger.Out = backupOut
+
+}
+
+func TestGoroutinesMonitorLeak(t *testing.T) {
+	keepStatsFor := 24 * 60 * time.Millisecond
+	goroutinesThreshold := 100
+	checkInterval := 1 * time.Millisecond
+	checkStatsFor := 60 * time.Millisecond
+	cooldownPeriod := 10 * time.Millisecond
+
+	backupOut := logger.Out
+	// Create a pipe to capture log output
+	r, w, _ := os.Pipe()
+	logger.Out = w
+
+	go func() {
+		goroutinesMonitor(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod)
+	}()
+
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+
+	// Create a goroutine to simulate the creation of goroutines
+	go func() {
+		timeStart := time.Now()
+		for {
+			if time.Since(timeStart) > 2*keepStatsFor {
+				break
+			}
+			// Create a goroutine
+			go func() {
+				time.Sleep(checkInterval * 100)
+			}()
+			time.Sleep(checkInterval / 2)
+		}
+		wg.Done()
+	}()
+
+	wg.Wait()
+
+	// Close the pipe
+	_ = w.Close()
+
+	logger.Out = backupOut
+
+	// Read the log output
+	output, _ := io.ReadAll(r)
+
+	// Check if the log output contains the expected message
+	if !strings.Contains(string(output), "leak detected") {
+		t.Errorf("Expected log output to contain 'leak detected'")
+	}
+
+}