Skip to content

Commit

Permalink
Merge pull request containers#24358 from Luap99/healthcheck-startup-leak
Browse files Browse the repository at this point in the history
healthcheck: do not leak startup service
  • Loading branch information
openshift-merge-bot[bot] authored Oct 25, 2024
2 parents 584109f + c0f4e2c commit 2f6fca6
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 7 deletions.
8 changes: 8 additions & 0 deletions libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"time"

"github.com/containers/podman/v5/libpod/define"
"github.com/containers/podman/v5/libpod/shutdown"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
Expand Down Expand Up @@ -273,6 +274,13 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
// Which happens to be us.
// So this has to be last - after this, systemd serves us a
// SIGTERM and we exit.
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
// is called. Fundamentally this is expected so switch it to exit 0.
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
// is the case here as we should not alter the exit code of another process that just
// happened to call this.
shutdown.SetExitCode(0)
if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil {
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
return
Expand Down
11 changes: 6 additions & 5 deletions libpod/healthcheck_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,20 +137,21 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
}

// Reset the service before stopping it to make sure it's being removed
// on stop.
serviceChan := make(chan string)
serviceFile := fmt.Sprintf("%s.service", unitName)
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err)
}
if _, err := conn.StopUnitContext(ctx, serviceFile, "ignore-dependencies", serviceChan); err != nil {
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
}
} else if err := systemdOpSuccessful(serviceChan); err != nil {
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
}
// Reset the service after stopping it to make sure it's being removed, systemd keep failed transient services
// around in its state. We do not care about the error and we need to ensure to reset the state so we do not
// leak resources forever.
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err)
}

return errorhandling.JoinErrors(stopErrors)
}
Expand Down
13 changes: 11 additions & 2 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,11 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"

# Check that we now we do have valid podman units with this
# name so that the leak check below does not turn into a NOP without noticing.
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"
run -0 systemctl list-units
cidmatch=$(grep "$cid" <<<"$output")
echo "$cidmatch"
assert "$cidmatch" =~ " $cid-[0-9a-f]+\.timer *.*/podman healthcheck run $cid" \
"Healthcheck systemd unit exists"

current_time=$(date --iso-8601=ns)
# After three successive failures, container should no longer be healthy
Expand All @@ -117,7 +121,12 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"

# Important check for https://github.com/containers/podman/issues/22884
# We never should leak the unit files, healthcheck uses the cid in name so just grep that.
assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup"
# (Ignore .scope units, those are conmon and can linger for 5 minutes)
# (Ignore .mount, too. They are created/removed by systemd based on the actual real mounts
# on the host and that is async and might be slow enough in CI to cause failures.)
run -0 systemctl list-units --quiet "*$cid*"
except_scope_mount=$(grep -vF ".scope " <<<"$output" | { grep -vF ".mount" || true; } )
assert "$except_scope_mount" == "" "Healthcheck systemd unit cleanup: no units leaked"
}

@test "podman healthcheck - restart cleans up old state" {
Expand Down

0 comments on commit 2f6fca6

Please sign in to comment.