Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 27 additions & 12 deletions libpod/oci_conmon_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -973,15 +973,14 @@ func getPreserveFdExtraFiles(preserveFD []uint, preserveFDs uint) (uint, []*os.F
return preserveFDs, filesToClose, extraFiles, nil
}

// createOCIContainer generates this container's main conmon instance and prepares it for starting
func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
// createOCIContainerBase generates this container's main conmon instance (base implementation)
func (r *ConmonOCIRuntime) createOCIContainerBase(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
var stderrBuf bytes.Buffer

parentSyncPipe, childSyncPipe, err := newPipe()
if err != nil {
return 0, fmt.Errorf("creating socket pair: %w", err)
}
defer errorhandling.CloseQuiet(parentSyncPipe)

childStartPipe, parentStartPipe, err := newPipe()
if err != nil {
Expand Down Expand Up @@ -1038,6 +1037,9 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
}

// Add healthcheck-related arguments (platform-specific implementation)
args = r.addHealthCheckArgs(ctr, args)

if r.noPivot {
args = append(args, "--no-pivot")
}
Expand Down Expand Up @@ -1199,6 +1201,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
// regardless of whether we errored or not, we no longer need the children pipes
childSyncPipe.Close()
childStartPipe.Close()

// Note: parentSyncPipe is NOT closed here because it's used for continuous healthcheck monitoring
if err != nil {
return 0, err
}
Expand All @@ -1219,7 +1223,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
return 0, fmt.Errorf("conmon failed: %w", err)
}

pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog)
pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog, ctr)
if err != nil {
if err2 := r.DeleteContainer(ctr); err2 != nil {
logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID())
Expand Down Expand Up @@ -1322,7 +1326,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
logDriverArg = define.NoLogging
case define.PassthroughLogging, define.PassthroughTTYLogging:
logDriverArg = define.PassthroughLogging
//lint:ignore ST1015 the default case has to be here
default: //nolint:gocritic
// No case here should happen except JSONLogging, but keep this here in case the options are extended
logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
Expand Down Expand Up @@ -1390,13 +1393,15 @@ func readConmonPidFile(pidFile string) (int, error) {
return 0, nil
}

// readConmonPipeData attempts to read a syncInfo struct from the pipe
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) {
// syncInfo is used to return data from monitor process to daemon
type syncInfo struct {
Data int `json:"data"`
Message string `json:"message,omitempty"`
}
// syncInfo is used to return data from monitor process to daemon
type syncInfo struct {
Data int `json:"data"`
Message string `json:"message,omitempty"`
}

// readConmonPipeDataBase reads the initial container creation response from conmon (base implementation)
// If ctr is provided, it will also start continuous healthcheck monitoring in a separate goroutine
func readConmonPipeDataBase(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) {

// Wait to get container pid from conmon
type syncStruct struct {
Expand All @@ -1412,11 +1417,14 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
// if it is no valid json unmarshal will fail below
if err != nil && !errors.Is(err, io.EOF) {
ch <- syncStruct{err: err}
return
}
if err := json.Unmarshal(b, &si); err != nil {
logrus.Debugf("Failed to unmarshal JSON from conmon: %v", err)
ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)}
return
}
logrus.Debugf("Successfully parsed JSON from conmon: Data=%d, Message=%q", si.Data, si.Message)
ch <- syncStruct{si: si}
}()

Expand All @@ -1436,6 +1444,13 @@ func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int,
return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err)
}
logrus.Debugf("Received: %d", ss.si.Data)

// Start continuous pipe monitoring if container is provided and PID is valid
// (platform-specific implementation)
if len(ctr) > 0 && ctr[0] != nil && ss.si.Data > 0 {
startContinuousPipeMonitoring(ctr[0], pipe, ss.si.Data)
}

if ss.si.Data < 0 {
if ociLog != "" {
ociLogData, err := os.ReadFile(ociLog)
Expand Down
260 changes: 260 additions & 0 deletions libpod/oci_conmon_nosystemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
//go:build !remote && (linux || freebsd) && !systemd

package libpod

import (
"bufio"
jsonlib "encoding/json"
"io"
"os"
"strconv"
"time"

"github.com/containers/podman/v5/libpod/define"
"github.com/sirupsen/logrus"
)

const (
	// HealthCheckMsgStatusUpdate is the base healthcheck message type sent by
	// conmon over the sync pipe. It is negative so it can never collide with a
	// container PID, which is always positive.
	HealthCheckMsgStatusUpdate = -100

	// Healthcheck status values sent by conmon. Each value is added to the
	// base message type (-100) on the wire, so e.g. "healthy" arrives as -98.
	HealthCheckStatusNone      = 0
	HealthCheckStatusStarting  = 1
	HealthCheckStatusHealthy   = 2
	HealthCheckStatusUnhealthy = 3
)

// createOCIContainer generates this container's main conmon instance with healthcheck support
func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
	// Delegate all of the real work to the shared base implementation.
	pid, err := r.createOCIContainerBase(ctr, restoreOptions)
	if err != nil {
		return pid, err
	}

	// Non-systemd builds note that the container was set up with healthcheck support.
	logrus.Debugf("HEALTHCHECK: Container %s created with healthcheck support", ctr.ID())
	return pid, nil
}

// readConmonPipeData reads the container creation response from conmon and
// returns the container PID (or a negative error code) parsed from the pipe.
//
// Continuous healthcheck monitoring is started by readConmonPipeDataBase
// itself whenever a container is supplied and a valid PID was received, so
// this wrapper must NOT start it a second time: doing so would spawn two
// goroutines racing to read the same pipe, interleaving and corrupting the
// newline-delimited JSON message stream from conmon.
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string, ctr ...*Container) (int, error) {
	data, err := readConmonPipeDataBase(runtimeName, pipe, ociLog, ctr...)
	if err != nil {
		return data, err
	}

	if len(ctr) > 0 && ctr[0] != nil && data > 0 {
		// Monitoring is already running; just record that for debugging.
		logrus.Debugf("HEALTHCHECK: Pipe monitoring for container %s (PID: %d) started by base reader", ctr[0].ID(), data)
	}

	return data, nil
}

// addHealthCheckArgs adds healthcheck-related arguments to conmon for non-systemd builds
func (r *ConmonOCIRuntime) addHealthCheckArgs(ctr *Container, args []string) []string {
	// Containers without a healthcheck need no extra conmon flags.
	if !ctr.HasHealthCheck() {
		logrus.Debugf("HEALTHCHECK: Container %s does not have healthcheck config, skipping healthcheck args", ctr.ID())
		return args
	}

	healthConfig := ctr.HealthCheckConfig()
	if healthConfig == nil {
		return args
	}

	logrus.Debugf("HEALTHCHECK: Adding healthcheck CLI args for container %s", ctr.ID())

	// Translate the test array into a command plus its argument list.
	healthCmd, healthArgs := r.buildHealthcheckCmdAndArgs(healthConfig.Test)
	if healthCmd == "" {
		logrus.Warnf("HEALTHCHECK: Container %s has healthcheck config but no valid command", ctr.ID())
		return args
	}

	args = append(args, "--healthcheck-cmd", healthCmd)
	for _, hcArg := range healthArgs {
		args = append(args, "--healthcheck-arg", hcArg)
	}

	// Validate the optional parameters, substituting defaults where needed.
	interval := r.validateAndGetInterval(healthConfig.Interval)
	timeout := r.validateAndGetTimeout(healthConfig.Timeout)
	retries := r.validateAndGetRetries(healthConfig.Retries)
	startPeriod := r.validateAndGetStartPeriod(healthConfig.StartPeriod)

	args = append(args,
		"--healthcheck-interval", strconv.Itoa(interval),
		"--healthcheck-timeout", strconv.Itoa(timeout),
		"--healthcheck-retries", strconv.Itoa(retries),
		"--healthcheck-start-period", strconv.Itoa(startPeriod),
	)

	logrus.Debugf("HEALTHCHECK: Added healthcheck args for container %s: cmd=%s, args=%v, interval=%ds, timeout=%ds, retries=%d, start-period=%ds",
		ctr.ID(), healthCmd, healthArgs, interval, timeout, retries, startPeriod)

	return args
}

// startContinuousPipeMonitoring launches a goroutine that keeps reading
// healthcheck status updates from conmon's sync pipe for the life of the pipe
// (non-systemd builds). The pid is used for diagnostics only.
func startContinuousPipeMonitoring(ctr *Container, pipe *os.File, pid int) {
	// Fix: the format string has two verbs but the pid argument was missing,
	// which logged "%!d(MISSING)" instead of the PID.
	logrus.Debugf("Starting continuous pipe monitoring for container %s (PID: %d)", ctr.ID(), pid)
	go readConmonHealthCheckPipeData(ctr, pipe)
}

// readConmonHealthCheckPipeData continuously reads healthcheck status updates from conmon.
//
// It blocks until the pipe is closed (conmon exited) or a read error occurs,
// so it is intended to run in its own goroutine. Each line on the pipe is
// expected to be a JSON-encoded syncInfo message; status updates arrive as the
// base message type (-100) plus a status value, and all other message types
// are logged and ignored. Malformed lines are skipped rather than fatal.
func readConmonHealthCheckPipeData(ctr *Container, pipe *os.File) {
	logrus.Debugf("HEALTHCHECK: Starting continuous healthcheck monitoring for container %s", ctr.ID())

	rdr := bufio.NewReader(pipe)
	for {
		// Messages are newline-delimited; read exactly one per iteration.
		b, err := rdr.ReadBytes('\n')
		if err != nil {
			if err == io.EOF {
				// Pipe closed by conmon — normal end of monitoring.
				logrus.Debugf("HEALTHCHECK: Pipe closed for container %s, stopping monitoring", ctr.ID())
				return
			}
			logrus.Errorf("HEALTHCHECK: Error reading from pipe for container %s: %v", ctr.ID(), err)
			return
		}

		// Log the raw JSON string received from conmon.
		logrus.Debugf("HEALTHCHECK: Raw JSON received from conmon for container %s: %q", ctr.ID(), string(b))
		logrus.Debugf("HEALTHCHECK: JSON length: %d bytes", len(b))

		// Parse the JSON; a bad line is skipped so monitoring keeps running.
		var si syncInfo
		if err := jsonlib.Unmarshal(b, &si); err != nil {
			logrus.Errorf("HEALTHCHECK: Failed to parse JSON from conmon for container %s: %v", ctr.ID(), err)
			continue
		}

		logrus.Debugf("HEALTHCHECK: Parsed sync info for container %s: Data=%d, Message=%q", ctr.ID(), si.Data, si.Message)

		// Healthcheck encoding scheme: the base message type is -100 and the
		// status value is added to it:
		//   -100 + 0 (none)      = -100
		//   -100 + 1 (starting)  =  -99
		//   -100 + 2 (healthy)   =  -98
		//   -100 + 3 (unhealthy) =  -97
		if si.Data >= HealthCheckMsgStatusUpdate && si.Data <= HealthCheckMsgStatusUpdate+HealthCheckStatusUnhealthy {
			statusValue := si.Data - HealthCheckMsgStatusUpdate // Convert back to status value
			var status string

			// Map the numeric status onto libpod's healthcheck status strings.
			switch statusValue {
			case HealthCheckStatusNone:
				status = define.HealthCheckReset // "reset" or "none"
			case HealthCheckStatusStarting:
				status = define.HealthCheckStarting // "starting"
			case HealthCheckStatusHealthy:
				status = define.HealthCheckHealthy // "healthy"
			case HealthCheckStatusUnhealthy:
				status = define.HealthCheckUnhealthy // "unhealthy"
			default:
				logrus.Errorf("HEALTHCHECK: Unknown status value %d for container %s", statusValue, ctr.ID())
				continue
			}

			logrus.Infof("HEALTHCHECK: Received healthcheck status update for container %s: %s (message type: %d, status value: %d)",
				ctr.ID(), status, si.Data, statusValue)

			// Persist the new status on the container; failures are logged but
			// do not stop the monitoring loop.
			if err := ctr.updateHealthStatus(status); err != nil {
				logrus.Errorf("HEALTHCHECK: Failed to update healthcheck status for container %s: %v", ctr.ID(), err)
			} else {
				logrus.Infof("HEALTHCHECK: Successfully updated healthcheck status for container %s to %s", ctr.ID(), status)
			}
		} else if si.Data < 0 {
			// Other negative message types - might be healthcheck related but not recognized
			logrus.Debugf("HEALTHCHECK: Received unrecognized negative message type %d for container %s - might be healthcheck related", si.Data, ctr.ID())
		} else if si.Data > 0 {
			// Positive message types - not healthcheck related
			logrus.Debugf("HEALTHCHECK: Received positive message type %d for container %s - not healthcheck related", si.Data, ctr.ID())
		}
	}
}

// buildHealthcheckCmdAndArgs converts Podman's healthcheck test array to command and arguments
func (r *ConmonOCIRuntime) buildHealthcheckCmdAndArgs(test []string) (string, []string) {
	if len(test) == 0 {
		return "", nil
	}

	head, rest := test[0], test[1:]

	switch head {
	case "", "NONE":
		// Explicitly disabled healthcheck.
		return "", nil
	case "CMD":
		// ["CMD", "curl", "-f", url] -> cmd="curl", args=["-f", url]
		if len(rest) == 0 {
			return "", nil
		}
		return rest[0], rest[1:]
	case "CMD-SHELL":
		// ["CMD-SHELL", "curl -f url"] -> cmd="/bin/sh", args=["-c", "curl -f url"]
		if len(rest) == 0 {
			return "", nil
		}
		return "/bin/sh", []string{"-c", rest[0]}
	}

	// Bare command form: ["curl", "-f", url] -> cmd="curl", args=["-f", url]
	return head, rest
}

// validateAndGetInterval validates and returns the healthcheck interval in seconds
func (r *ConmonOCIRuntime) validateAndGetInterval(interval time.Duration) int {
	switch {
	case interval <= 0:
		// Unset or non-positive: fall back to the 30-second default.
		return 30
	case interval < time.Second:
		// Clamp sub-second intervals up to the 1-second minimum.
		logrus.Warnf("HEALTHCHECK: Interval %v is less than 1 second, using 1 second", interval)
		return 1
	default:
		return int(interval.Seconds())
	}
}

// validateAndGetTimeout validates and returns the healthcheck timeout in seconds
func (r *ConmonOCIRuntime) validateAndGetTimeout(timeout time.Duration) int {
	switch {
	case timeout <= 0:
		// Unset or non-positive: fall back to the 30-second default.
		return 30
	case timeout < time.Second:
		// Clamp sub-second timeouts up to the 1-second minimum.
		logrus.Warnf("HEALTHCHECK: Timeout %v is less than 1 second, using 1 second", timeout)
		return 1
	default:
		return int(timeout.Seconds())
	}
}

// validateAndGetRetries validates and returns the healthcheck retries count
func (r *ConmonOCIRuntime) validateAndGetRetries(retries int) int {
	switch {
	case retries <= 0:
		// Unset or non-positive: fall back to the default of 3 retries.
		return 3
	case retries > 10:
		// Cap excessive values (conmon should enforce this too).
		logrus.Warnf("HEALTHCHECK: Retries %d is very high, using 10", retries)
		return 10
	default:
		return retries
	}
}

// validateAndGetStartPeriod validates and returns the healthcheck start period in seconds
func (r *ConmonOCIRuntime) validateAndGetStartPeriod(startPeriod time.Duration) int {
	if startPeriod >= 0 {
		return int(startPeriod.Seconds())
	}
	// Negative values make no sense; treat them as "no grace period".
	logrus.Warnf("HEALTHCHECK: Start period %v is negative, using 0", startPeriod)
	return 0
}
Loading