diff --git a/cmd/cachewd/main.go b/cmd/cachewd/main.go
index b41814c..fcba618 100644
--- a/cmd/cachewd/main.go
+++ b/cmd/cachewd/main.go
@@ -26,6 +26,7 @@ import (
 	"github.com/block/cachew/internal/jobscheduler"
 	"github.com/block/cachew/internal/logging"
 	"github.com/block/cachew/internal/metrics"
+	"github.com/block/cachew/internal/reaper"
 	"github.com/block/cachew/internal/strategy"
 	"github.com/block/cachew/internal/strategy/git"
 	"github.com/block/cachew/internal/strategy/gomod"
@@ -64,6 +65,8 @@ func main() {
 	ctx := context.Background()
 	logger, ctx := logging.Configure(ctx, globalConfig.LoggingConfig)
 
+	reaper.Start(ctx)
+
 	// Start initialising
 	tokenManagerProvider := githubapp.NewTokenManagerProvider(globalConfig.GithubAppConfig, logger)
 	managerProvider := gitclone.NewManagerProvider(ctx, globalConfig.GitCloneConfig, func() (gitclone.CredentialProvider, error) {
diff --git a/internal/reaper/reaper.go b/internal/reaper/reaper.go
new file mode 100644
index 0000000..f5d364b
--- /dev/null
+++ b/internal/reaper/reaper.go
@@ -0,0 +1,91 @@
+// Package reaper provides a background zombie process reaper.
+//
+// When a Go process runs as PID 1 (e.g. inside a container), it inherits
+// orphaned child processes. If those children exit without being waited on,
+// they accumulate as zombies. This package periodically calls waitpid(-1)
+// with WNOHANG to reap any such zombies.
+package reaper
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"os"
+	"syscall"
+	"time"
+
+	"github.com/block/cachew/internal/logging"
+)
+
+// reapInterval is the period between background reaping sweeps.
+const reapInterval = 5 * time.Second
+
+// Start launches a background goroutine that reaps zombie child processes.
+// It only activates when the current process is PID 1. The goroutine exits
+// when ctx is cancelled.
+func Start(ctx context.Context) {
+	logger := logging.FromContext(ctx)
+	if os.Getpid() != 1 {
+		logger.DebugContext(ctx, "Zombie reaper not needed, not running as PID 1")
+		return
+	}
+	logger.InfoContext(ctx, "Running as PID 1, starting zombie reaper")
+	go run(ctx)
+}
+
+// StartForTest is like Start but skips the PID 1 check.
+func StartForTest(ctx context.Context) {
+	go run(ctx)
+}
+
+// Reap collects all currently-zombie child processes without blocking.
+// It calls wait4(-1) with WNOHANG in a loop and returns once no exited child
+// is pending. Each reaped process is logged at debug level; a wait failure
+// other than ECHILD or EINTR is logged as a warning and ends the sweep.
+//
+// NOTE(review): waiting on -1 collects any exited child of this process, not
+// only inherited orphans. If other code starts children and waits on them
+// itself (for example via os/exec), a sweep can collect such a child first.
+// Confirm that callers tolerate this when running as PID 1.
+func Reap(ctx context.Context) {
+	logger := logging.FromContext(ctx)
+	for {
+		var status syscall.WaitStatus
+		pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
+		switch {
+		case errors.Is(err, syscall.EINTR):
+			// Interrupted before anything was collected; try again.
+			continue
+		case errors.Is(err, syscall.ECHILD):
+			// This process has no children, so there is nothing to reap.
+			return
+		case err != nil:
+			logger.WarnContext(ctx, "Zombie reaper wait failed", slog.Any("error", err))
+			return
+		case pid <= 0:
+			// Children exist, but none of them has exited yet.
+			return
+		}
+		if status.Signaled() {
+			// ExitStatus is -1 for a signal death, so report the signal instead.
+			logger.DebugContext(ctx, "Reaped zombie process", slog.Int("pid", pid), slog.String("signal", status.Signal().String()))
+			continue
+		}
+		logger.DebugContext(ctx, "Reaped zombie process", slog.Int("pid", pid), slog.Int("status", status.ExitStatus()))
+	}
+}
+
+// run calls Reap every reapInterval until ctx is cancelled.
+func run(ctx context.Context) {
+	ticker := time.NewTicker(reapInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			Reap(ctx)
+		}
+	}
+}
diff --git a/internal/reaper/reaper_test.go b/internal/reaper/reaper_test.go
new file mode 100644
index 0000000..c14c7fa
--- /dev/null
+++ b/internal/reaper/reaper_test.go
@@ -0,0 +1,86 @@
+//go:build unix
+
+package reaper_test
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"os"
+	"os/exec"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/alecthomas/assert/v2"
+
+	"github.com/block/cachew/internal/logging"
+	"github.com/block/cachew/internal/reaper"
+)
+
+// testContext returns the test's context with debug-level logging configured.
+func testContext(t *testing.T) context.Context {
+	t.Helper()
+	_, ctx := logging.Configure(t.Context(), logging.Config{Level: slog.LevelDebug})
+	return ctx
+}
+
+// TestStartReapsZombies checks that Reap collects an exited child that nobody
+// has waited on.
+func TestStartReapsZombies(t *testing.T) {
+	if os.Getpid() == 1 {
+		t.Skip("test assumes we are not PID 1")
+	}
+
+	ctx, cancel := context.WithCancel(testContext(t))
+	defer cancel()
+
+	reaper.StartForTest(ctx)
+
+	// Create a child process that exits immediately. We deliberately
+	// don't call cmd.Wait(), so the exited process becomes a zombie
+	// that the reaper should collect.
+	cmd := exec.Command("true")
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	err := cmd.Start()
+	assert.NoError(t, err)
+
+	pid := cmd.Process.Pid
+	// Wait is never called, so free the process handle explicitly.
+	defer func() { _ = cmd.Process.Release() }()
+
+	// Sweep until the child's PID is gone rather than sleeping for a fixed
+	// time. Signal 0 still succeeds for a zombie, so ESRCH means it has been
+	// collected.
+	deadline := time.Now().Add(5 * time.Second)
+	for {
+		reaper.Reap(ctx)
+		if errors.Is(syscall.Kill(pid, 0), syscall.ESRCH) {
+			break
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("child %d was not reaped before the deadline", pid)
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+
+	// Verify the zombie was reaped: waiting on it must now fail with ECHILD.
+	// A (0, nil) result would only mean the child is still running.
+	var status syscall.WaitStatus
+	_, err = syscall.Wait4(pid, &status, syscall.WNOHANG, nil)
+	assert.True(t, errors.Is(err, syscall.ECHILD), "expected zombie to have been reaped")
+}
+
+// TestStartSkipsWhenNotPID1 checks that Start returns cleanly when the
+// process is not PID 1.
+func TestStartSkipsWhenNotPID1(t *testing.T) {
+	if os.Getpid() == 1 {
+		t.Skip("unexpectedly running as PID 1")
+	}
+
+	ctx, cancel := context.WithCancel(testContext(t))
+	defer cancel()
+
+	// Should return immediately without starting a goroutine.
+	reaper.Start(ctx)
+}