Skip to content

Commit

Permalink
libct: move killing logic to container.Signal
Browse files Browse the repository at this point in the history
By default, the container has its own PID namespace, and killing (with
SIGKILL) its init process from the parent PID namespace also kills all
the other processes.

Obviously, it does not work that way when the container is sharing its
PID namespace with the host or another container, since init is no
longer special (it's not PID 1). In this case, killing container's init
will result in a bunch of other processes left running (and thus the
inability to remove the cgroup).

The solution to the above problem is killing all the container
processes, not just init.

The problem with the current implementation is, the killing logic is
implemented in libcontainer's initProcess.wait, and thus only available
to libcontainer users, but not the runc kill command (which uses
nonChildProcess.kill and does not use wait at all). So, some workarounds
exist:
 - func destroy(c *Container) calls signalAllProcesses;
 - runc kill implements -a flag.

This code became very tangled over time. Let's simplify things by moving
the killing all processes from initProcess.wait to container.Signal,
and documents the new behavior.

In essence, this also makes `runc kill` to automatically kill all container
processes when the container does not have its own PID namespace.
Document that as well.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
  • Loading branch information
kolyshkin committed Jun 8, 2023
1 parent 2a7dcbb commit 9583b3d
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 36 deletions.
52 changes: 32 additions & 20 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,9 +358,9 @@ func (c *Container) start(process *Process) (retErr error) {
}

// Signal sends a specified signal to container's init, or, if all is true,
// true, to all container's processes (as determined by container's cgroup).
// to all container's processes (as determined by container's cgroup).
//
// Setting all=true is useful when s is SIGKILL and the container does not have
// Note all=true is implied when s is SIGKILL and the container does not have
// its own PID namespace. In this scenario, the libcontainer user may be required
// to implement a proper child reaper.
func (c *Container) Signal(s os.Signal, all bool) error {
Expand All @@ -382,22 +382,36 @@ func (c *Container) Signal(s os.Signal, all bool) error {
}
return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig))
}
// to avoid a PID reuse attack
if status == Running || status == Created || status == Paused {
if err := c.initProcess.signal(s); err != nil {
return fmt.Errorf("unable to signal init: %w", err)
}
if status == Paused {
// For cgroup v1, killing a process in a frozen cgroup
// does nothing until it's thawed. Only thaw the cgroup
// for SIGKILL.
if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
_ = c.cgroupManager.Freeze(configs.Thawed)
}
}
return nil

// To avoid a PID reuse attack, don't kill non-running container.
switch status {
case Running, Created, Paused:
default:
return ErrNotRunning
}
return ErrNotRunning

// When a container has its own PID namespace, inside it the init PID
// is 1, and thus it is handled specially by the kernel. In particular,
// killing init with SIGKILL from an ancestor namespace will also kill
// all other processes in that PID namespace (see pid_namespaces(7)).
//
// OTOH, if PID namespace is shared, we should kill all pids to avoid
// leftover processes.
if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
} else {
err = c.initProcess.signal(s)
}
if err != nil {
return fmt.Errorf("unable to signal init: %w", err)
}
if status == Paused && s == unix.SIGKILL {
// For cgroup v1, killing a process in a frozen cgroup
// does nothing until it's thawed. Only thaw the cgroup
// for SIGKILL.
_ = c.cgroupManager.Freeze(configs.Thawed)
}
return nil
}

func (c *Container) createExecFifo() error {
Expand Down Expand Up @@ -593,7 +607,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
container: c,
process: p,
bootstrapData: data,
sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
}
c.initProcess = init
return init, nil
Expand Down Expand Up @@ -696,8 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
return cfg
}

// Destroy destroys the container, if its in a valid state, after killing any
// remaining running processes.
// Destroy destroys the container, if its in a valid state.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
Expand Down
4 changes: 2 additions & 2 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1399,8 +1399,8 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) {
err = container.Run(process2)
ok(t, err)

// Kill the init process and Wait for it.
err = process1.Signal(syscall.SIGKILL)
// Kill the container.
err = container.Signal(syscall.SIGKILL, false)
ok(t, err)
_, err = process1.Wait()
if err == nil {
Expand Down
5 changes: 0 additions & 5 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,6 @@ type initProcess struct {
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
}

func (p *initProcess) pid() int {
Expand Down Expand Up @@ -616,10 +615,6 @@ func (p *initProcess) start() (retErr error) {

func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
_ = signalAllProcesses(p.manager, unix.SIGKILL)
}
return p.cmd.ProcessState, err
}

Expand Down
7 changes: 0 additions & 7 deletions libcontainer/state_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (

"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

Expand Down Expand Up @@ -36,12 +35,6 @@ type containerState interface {
}

func destroy(c *Container) error {
if !c.config.Namespaces.Contains(configs.NEWPID) ||
c.config.Namespaces.PathOf(configs.NEWPID) != "" {
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
logrus.Warn(err)
}
}
err := c.cgroupManager.Destroy()
if c.intelRdtManager != nil {
if ierr := c.intelRdtManager.Destroy(); err == nil {
Expand Down
4 changes: 2 additions & 2 deletions man/runc-kill.8.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ to list available signals.
# OPTIONS
**--all**|**-a**
: Send the signal to all processes inside the container, rather than
the container's init only. This option is useful when the container does not
have its own PID namespace.
the container's init only. This option is implied when the _signal_ is **KILL**
and the container does not have its own PID namespace.

: When this option is set, no error is returned if the container is stopped
or does not exist.
Expand Down

0 comments on commit 9583b3d

Please sign in to comment.