From fbed3a01d289bc3fb245e4940086d4c4a06e3a63 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Tue, 17 Sep 2024 16:48:37 +0200 Subject: [PATCH] wait: fix handling of multiple conditions with exited As it turns out, things are not so simple after all... In podman-py it was reported[1] that waiting might hang, per our docs wait on multiple conditions should exit once the first one is hit and not all of them. However because the new wait logic never checked if the context was cancelled the goroutine kept running until conmon exited and because we used a waitgroup to wait for all of them to finish it blocked until that happened. First we can remove the waitgroup as we only need to wait for one of them anyway via the channel. While this alone fixes the hang it would still leak the other goroutine. As there is no way to cancel a goroutine all the code must check for a cancelled context in the wait loop to not leak. Fixes 8a943311db ("libpod: simplify WaitForExit()") [1] https://github.com/containers/podman-py/issues/425 Signed-off-by: Paul Holzinger --- libpod/container_api.go | 30 +++++++++++++++++------------- test/e2e/wait_test.go | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index 2b5e305348..9e225940d5 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -9,7 +9,6 @@ import ( "io" "net/http" "os" - "sync" "time" "github.com/containers/common/pkg/resize" @@ -596,7 +595,7 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) // we cannot wait locked as we would hold the lock forever, so we unlock and then lock again c.lock.Unlock() - err := waitForConmonExit(conmonPID, conmonPidFd, pollInterval) + err := waitForConmonExit(ctx, conmonPID, conmonPidFd, pollInterval) c.lock.Lock() if err != nil { return -1, fmt.Errorf("failed to wait for conmon to exit: %w", err) @@ -619,15 +618,24 @@ func (c *Container) WaitForExit(ctx context.Context, 
pollInterval time.Duration) return c.runtime.state.GetContainerExitCode(id) } -func waitForConmonExit(conmonPID, conmonPidFd int, pollInterval time.Duration) error { +func waitForConmonExit(ctx context.Context, conmonPID, conmonPidFd int, pollInterval time.Duration) error { if conmonPidFd > -1 { for { fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}} - if _, err := unix.Poll(fds, -1); err != nil { + if n, err := unix.Poll(fds, int(pollInterval.Milliseconds())); err != nil { if err == unix.EINTR { continue } return err + } else if n == 0 { + // n == 0 means timeout + select { + case <-ctx.Done(): + return define.ErrCanceled + default: + // context not done, wait again + continue + } } return nil } @@ -640,7 +648,11 @@ func waitForConmonExit(conmonPID, conmonPidFd int, pollInterval time.Duration) e } return err } - time.Sleep(pollInterval) + select { + case <-ctx.Done(): + return define.ErrCanceled + case <-time.After(pollInterval): + } } return nil } @@ -695,22 +707,15 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou } } - var wg sync.WaitGroup - if waitForExit { - wg.Add(1) go func() { - defer wg.Done() - code, err := c.WaitForExit(ctx, waitTimeout) trySend(code, err) }() } if len(wantedStates) > 0 || len(wantedHealthStates) > 0 { - wg.Add(1) go func() { - defer wg.Done() stoppedCount := 0 for { if len(wantedStates) > 0 { @@ -780,7 +785,6 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou case <-ctx.Done(): result = waitResult{-1, define.ErrCanceled} } - wg.Wait() return result.code, result.err } diff --git a/test/e2e/wait_test.go b/test/e2e/wait_test.go index a2f3a13175..99dad8583f 100644 --- a/test/e2e/wait_test.go +++ b/test/e2e/wait_test.go @@ -108,4 +108,19 @@ var _ = Describe("Podman wait", func() { Expect(session).Should(ExitCleanly()) Expect(session.OutputToStringArray()).To(Equal([]string{"0", "0", "0"})) }) + + It("podman wait on multiple conditions", func() { + session 
:= podmanTest.Podman([]string{"run", "-d", ALPINE, "sleep", "100"}) + session.Wait(20) + Expect(session).Should(ExitCleanly()) + cid := session.OutputToString() + + // condition should return once any of the conditions is met, not all of them, + // as the container is running this should return immediately + // https://github.com/containers/podman-py/issues/425 + session = podmanTest.Podman([]string{"wait", "--condition", "running,exited", cid}) + session.Wait(20) + Expect(session).Should(ExitCleanly()) + Expect(session.OutputToString()).To(Equal("-1")) + }) })