Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 27 additions & 12 deletions libpod/oci_conmon_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -973,15 +973,14 @@ func getPreserveFdExtraFiles(preserveFD []uint, preserveFDs uint) (uint, []*os.F
return preserveFDs, filesToClose, extraFiles, nil
}

// createOCIContainer generates this container's main conmon instance and prepares it for starting
func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
// createOCIContainerBase generates this container's main conmon instance (base implementation)
func (r *ConmonOCIRuntime) createOCIContainerBase(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
var stderrBuf bytes.Buffer

parentSyncPipe, childSyncPipe, err := newPipe()
if err != nil {
return 0, fmt.Errorf("creating socket pair: %w", err)
}
defer errorhandling.CloseQuiet(parentSyncPipe)

childStartPipe, parentStartPipe, err := newPipe()
if err != nil {
Expand Down Expand Up @@ -1038,6 +1037,9 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
}

// Add healthcheck-related arguments (platform-specific implementation)
args = r.addHealthCheckArgs(ctr, args)

if r.noPivot {
args = append(args, "--no-pivot")
}
Expand Down Expand Up @@ -1199,6 +1201,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
// regardless of whether we errored or not, we no longer need the children pipes
childSyncPipe.Close()
childStartPipe.Close()

// Note: parentSyncPipe is NOT closed here because it's used for continuous healthcheck monitoring
if err != nil {
return 0, err
}
Expand All @@ -1219,7 +1223,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
return 0, fmt.Errorf("conmon failed: %w", err)
}

pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog)
pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog, ctr)
if err != nil {
if err2 := r.DeleteContainer(ctr); err2 != nil {
logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID())
Expand Down Expand Up @@ -1322,7 +1326,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
logDriverArg = define.NoLogging
case define.PassthroughLogging, define.PassthroughTTYLogging:
logDriverArg = define.PassthroughLogging
//lint:ignore ST1015 the default case has to be here
default: //nolint:gocritic
// No case here should happen except JSONLogging, but keep this here in case the options are extended
logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
Expand Down Expand Up @@ -1390,13 +1393,15 @@ func readConmonPidFile(pidFile string) (int, error) {
return 0, nil
}

// readConmonPipeData attempts to read a syncInfo struct from the pipe
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) {
// syncInfo is used to return data from monitor process to daemon
type syncInfo struct {
Data int `json:"data"`
Message string `json:"message,omitempty"`
}
// syncInfo is used to return data from monitor process to daemon
type syncInfo struct {
Data int `json:"data"`
Message string `json:"message,omitempty"`
}

// readConmonPipeDataBase reads the initial container creation response from conmon (base implementation)
// If ctr is provided, it will also start continuous healthcheck monitoring in a separate goroutine
func readConmonPipeDataBase(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) {

// Wait to get container pid from conmon
type syncStruct struct {
Expand All @@ -1412,11 +1417,14 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
// if it is no valid json unmarshal will fail below
if err != nil && !errors.Is(err, io.EOF) {
ch <- syncStruct{err: err}
return
}
if err := json.Unmarshal(b, &si); err != nil {
logrus.Debugf("Failed to unmarshal JSON from conmon: %v", err)
ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)}
return
}
logrus.Debugf("Successfully parsed JSON from conmon: Data=%d, Message=%q", si.Data, si.Message)
ch <- syncStruct{si: si}
}()

Expand All @@ -1436,6 +1444,13 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err)
}
logrus.Debugf("Received: %d", ss.si.Data)

// Start continuous pipe monitoring if container is provided and PID is valid
// (platform-specific implementation)
if len(ctr) > 0 && ctr[0] != nil && ss.si.Data > 0 {
startContinuousPipeMonitoring(ctr[0], pipe, ss.si.Data)
}

if ss.si.Data < 0 {
if ociLog != "" {
ociLogData, err := os.ReadFile(ociLog)
Expand Down
260 changes: 260 additions & 0 deletions libpod/oci_conmon_nosystemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
//go:build !remote && (linux || freebsd) && !systemd

package libpod

import (
"bufio"
jsonlib "encoding/json"
"io"
"os"
"strconv"
"time"

"github.com/containers/podman/v5/libpod/define"
"github.com/sirupsen/logrus"
)

const (
	// HealthCheckMsgStatusUpdate is the base healthcheck message type sent by
	// conmon over the sync pipe. It is negative so it can never collide with a
	// container PID, which is always positive.
	HealthCheckMsgStatusUpdate = -100

	// Healthcheck status values sent by conmon. Each value is added to the
	// base message type (-100) on the wire, so e.g. "healthy" arrives as -98.
	HealthCheckStatusNone      = 0
	HealthCheckStatusStarting  = 1
	HealthCheckStatusHealthy   = 2
	HealthCheckStatusUnhealthy = 3
)

// createOCIContainer generates this container's main conmon instance with healthcheck support
func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
	// Delegate all of the real work to the shared base implementation.
	pid, err := r.createOCIContainerBase(ctr, restoreOptions)
	if err != nil {
		return pid, err
	}

	// Non-systemd builds note that the container was set up with healthcheck support.
	logrus.Debugf("HEALTHCHECK: Container %s created with healthcheck support", ctr.ID())
	return pid, nil
}

// readConmonPipeData reads the container creation response from conmon and
// returns the container PID (or a negative error code) parsed from the pipe.
//
// Continuous healthcheck monitoring is started by readConmonPipeDataBase
// itself whenever a container is supplied and a valid PID was received, so
// this wrapper must NOT start it a second time: doing so would spawn two
// goroutines racing to read the same pipe, interleaving and corrupting the
// newline-delimited JSON message stream from conmon.
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) {
	data, err := readConmonPipeDataBase(runtimeName, pipe, ociLog, ctr...)
	if err != nil {
		return data, err
	}

	if len(ctr) > 0 && ctr[0] != nil && data > 0 {
		// Monitoring is already running; just record that for debugging.
		logrus.Debugf("HEALTHCHECK: Pipe monitoring for container %s (PID: %d) started by base reader", ctr[0].ID(), data)
	}

	return data, nil
}

// addHealthCheckArgs adds healthcheck-related arguments to conmon for non-systemd builds
func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string {
	// Containers without a healthcheck need no extra conmon flags.
	if !ctr.HasHealthCheck() {
		logrus.Debugf("HEALTHCHECK: Container %s does not have healthcheck config, skipping healthcheck args", ctr.ID())
		return args
	}

	healthConfig := ctr.HealthCheckConfig()
	if healthConfig == nil {
		return args
	}

	logrus.Debugf("HEALTHCHECK: Adding healthcheck CLI args for container %s", ctr.ID())

	// Translate the test array into a command plus its argument list.
	healthCmd, healthArgs := r.buildHealthcheckCmdAndArgs(healthConfig.Test)
	if healthCmd == "" {
		logrus.Warnf("HEALTHCHECK: Container %s has healthcheck config but no valid command", ctr.ID())
		return args
	}

	args = append(args, "--healthcheck-cmd", healthCmd)
	for _, hcArg := range healthArgs {
		args = append(args, "--healthcheck-arg", hcArg)
	}

	// Validate the optional parameters, substituting defaults where needed.
	interval := r.validateAndGetInterval(healthConfig.Interval)
	timeout := r.validateAndGetTimeout(healthConfig.Timeout)
	retries := r.validateAndGetRetries(healthConfig.Retries)
	startPeriod := r.validateAndGetStartPeriod(healthConfig.StartPeriod)

	args = append(args,
		"--healthcheck-interval", strconv.Itoa(interval),
		"--healthcheck-timeout", strconv.Itoa(timeout),
		"--healthcheck-retries", strconv.Itoa(retries),
		"--healthcheck-start-period", strconv.Itoa(startPeriod),
	)

	logrus.Debugf("HEALTHCHECK: Added healthcheck args for container %s: cmd=%s, args=%v, interval=%ds, timeout=%ds, retries=%d, start-period=%ds",
		ctr.ID(), healthCmd, healthArgs, interval, timeout, retries, startPeriod)

	return args
}

// startContinuousPipeMonitoring launches a goroutine that keeps reading
// healthcheck status updates from conmon's sync pipe for the life of the pipe
// (non-systemd builds). The pid is used for diagnostics only.
func startContinuousPipeMonitoring(ctr *Container, pipe *os.File, pid int) {
	// Fix: the format string has two verbs but the pid argument was missing,
	// which logged "%!d(MISSING)" instead of the PID.
	logrus.Debugf("Starting continuous pipe monitoring for container %s (PID: %d)", ctr.ID(), pid)
	go readConmonHealthCheckPipeData(ctr, pipe)
}

// readConmonHealthCheckPipeData continuously reads healthcheck status updates from conmon.
//
// It blocks until the pipe is closed (conmon exited) or a read error occurs,
// so it is intended to run in its own goroutine. Each line on the pipe is
// expected to be a JSON-encoded syncInfo message; status updates arrive as the
// base message type (-100) plus a status value, and all other message types
// are logged and ignored. Malformed lines are skipped rather than fatal.
func readConmonHealthCheckPipeData(ctr *Container, pipe *os.File) {
	logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s", ctr.ID())

	rdr := bufio.NewReader(pipe)
	for {
		// Messages are newline-delimited; read exactly one per iteration.
		b, err := rdr.ReadBytes('\n')
		if err != nil {
			if err == io.EOF {
				// Pipe closed by conmon — normal end of monitoring.
				logrus.Debugf("HEALTHCHECK: Pipe closed for container %s, stopping monitoring", ctr.ID())
				return
			}
			logrus.Errorf("HEALTHCHECK: Error reading from pipe for container %s: %v", ctr.ID(), err)
			return
		}

		// Log the raw JSON string received from conmon.
		logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon for container %s: %q", ctr.ID(), string(b))
		logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b))

		// Parse the JSON; a bad line is skipped so monitoring keeps running.
		var si syncInfo
		if err := jsonlib.Unmarshal(b, &si); err != nil {
			logrus.Errorf("HEALTHCHECK: Failed to parse JSON from conmon for container %s: %v", ctr.ID(), err)
			continue
		}

		logrus.Debugf("HEALTHCHECK: Parsed sync info for container %s: Data=%d, Message=%q", ctr.ID(), si.Data, si.Message)

		// Healthcheck encoding scheme: the base message type is -100 and the
		// status value is added to it:
		//   -100 + 0 (none)      = -100
		//   -100 + 1 (starting)  =  -99
		//   -100 + 2 (healthy)   =  -98
		//   -100 + 3 (unhealthy) =  -97
		if si.Data >= HealthCheckMsgStatusUpdate && si.Data <= HealthCheckMsgStatusUpdate+HealthCheckStatusUnhealthy {
			statusValue := si.Data - HealthCheckMsgStatusUpdate // Convert back to status value
			var status string

			// Map the numeric status onto libpod's healthcheck status strings.
			switch statusValue {
			case HealthCheckStatusNone:
				status = define.HealthCheckReset // "reset" or "none"
			case HealthCheckStatusStarting:
				status = define.HealthCheckStarting // "starting"
			case HealthCheckStatusHealthy:
				status = define.HealthCheckHealthy // "healthy"
			case HealthCheckStatusUnhealthy:
				status = define.HealthCheckUnhealthy // "unhealthy"
			default:
				logrus.Errorf("HEALTHCHECK: Unknown status value %d for container %s", statusValue, ctr.ID())
				continue
			}

			logrus.Infof("HEALTHCHECK: Received healthcheck status update for container %s: %s (message type: %d, status value: %d)",
				ctr.ID(), status, si.Data, statusValue)

			// Persist the new status on the container; failures are logged but
			// do not stop the monitoring loop.
			if err := ctr.updateHealthStatus(status); err != nil {
				logrus.Errorf("HEALTHCHECK: Failed to update healthcheck status for container %s: %v", ctr.ID(), err)
			} else {
				logrus.Infof("HEALTHCHECK: Successfully updated healthcheck status for container %s to %s", ctr.ID(), status)
			}
		} else if si.Data < 0 {
			// Other negative message types - might be healthcheck related but not recognized
			logrus.Debugf("HEALTHCHECK: Received unrecognized negative message type %d for container %s - might be healthcheck related", si.Data, ctr.ID())
		} else if si.Data > 0 {
			// Positive message types - not healthcheck related
			logrus.Debugf("HEALTHCHECK: Received positive message type %d for container %s - not healthcheck related", si.Data, ctr.ID())
		}
	}
}

// buildHealthcheckCmdAndArgs converts Podman's healthcheck test array to command and arguments
func (r *ConmonOCIRuntime) buildHealthcheckCmdAndArgs(test []string) (string, []string) {
	if len(test) == 0 {
		return "", nil
	}

	head, rest := test[0], test[1:]

	switch head {
	case "", "NONE":
		// Explicitly disabled healthcheck.
		return "", nil
	case "CMD":
		// ["CMD", "curl", "-f", url] -> cmd="curl", args=["-f", url]
		if len(rest) == 0 {
			return "", nil
		}
		return rest[0], rest[1:]
	case "CMD-SHELL":
		// ["CMD-SHELL", "curl -f url"] -> cmd="/bin/sh", args=["-c", "curl -f url"]
		if len(rest) == 0 {
			return "", nil
		}
		return "/bin/sh", []string{"-c", rest[0]}
	}

	// Bare command form: ["curl", "-f", url] -> cmd="curl", args=["-f", url]
	return head, rest
}

// validateAndGetInterval validates and returns the healthcheck interval in seconds
func (r *ConmonOCIRuntime) validateAndGetInterval(interval time.Duration) int {
	switch {
	case interval <= 0:
		// Unset or non-positive: fall back to the 30-second default.
		return 30
	case interval < time.Second:
		// Clamp sub-second intervals up to the 1-second minimum.
		logrus.Warnf("HEALTHCHECK: Interval %v is less than 1 second, using 1 second", interval)
		return 1
	default:
		return int(interval.Seconds())
	}
}

// validateAndGetTimeout validates and returns the healthcheck timeout in seconds
func (r *ConmonOCIRuntime) validateAndGetTimeout(timeout time.Duration) int {
	switch {
	case timeout <= 0:
		// Unset or non-positive: fall back to the 30-second default.
		return 30
	case timeout < time.Second:
		// Clamp sub-second timeouts up to the 1-second minimum.
		logrus.Warnf("HEALTHCHECK: Timeout %v is less than 1 second, using 1 second", timeout)
		return 1
	default:
		return int(timeout.Seconds())
	}
}

// validateAndGetRetries validates and returns the healthcheck retries count
func (r *ConmonOCIRuntime) validateAndGetRetries(retries int) int {
	switch {
	case retries <= 0:
		// Unset or non-positive: fall back to the default of 3 retries.
		return 3
	case retries > 10:
		// Cap excessive values (conmon should enforce this too).
		logrus.Warnf("HEALTHCHECK: Retries %d is very high, using 10", retries)
		return 10
	default:
		return retries
	}
}

// validateAndGetStartPeriod validates and returns the healthcheck start period in seconds
func (r *ConmonOCIRuntime) validateAndGetStartPeriod(startPeriod time.Duration) int {
	if startPeriod >= 0 {
		return int(startPeriod.Seconds())
	}
	// Negative values make no sense; treat them as "no grace period".
	logrus.Warnf("HEALTHCHECK: Start period %v is negative, using 0", startPeriod)
	return 0
}
Loading