Skip to content

Commit

Permalink
runc start/run: report OOM
Browse files Browse the repository at this point in the history
In some cases, container init fails to start because it is killed by
the kernel OOM killer. The errors returned by runc in such cases are
semi-random and rather cryptic. Below are a few examples.

On cgroup v1 + systemd cgroup driver:

> process_linux.go:348: copying bootstrap data to pipe caused: write init-p: broken pipe

> process_linux.go:352: getting the final child's pid from pipe caused: EOF

On cgroup v2:

> process_linux.go:495: container init caused: read init-p: connection reset by peer

> process_linux.go:484: writing syncT 'resume' caused: write init-p: broken pipe

This commit adds the OOMKill method to cgroup managers, which tells whether
the container was OOM-killed. In case that has happened, the original error
is discarded (unless --debug is set), and the new OOM error is reported
instead:

> ERRO[0000] container_linux.go:367: starting container process caused: container init was OOM-killed (memory limit too low?)

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
  • Loading branch information
kolyshkin committed Feb 22, 2021
1 parent 089d7b4 commit e82a265
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 0 deletions.
3 changes: 3 additions & 0 deletions libcontainer/cgroups/cgroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ type Manager interface {

// Whether the cgroup path exists or not
Exists() bool

// OOMKill reports OOM kill count for the cgroup.
OOMKill() (uint64, error)
}
9 changes: 9 additions & 0 deletions libcontainer/cgroups/fs/fs.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"sync"

"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
Expand Down Expand Up @@ -421,3 +422,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
// Exists reports whether the cgroup path exists, judged by checking the
// path this manager returns for the "devices" subsystem.
func (m *manager) Exists() bool {
	devicesPath := m.Path("devices")
	return cgroups.PathExists(devicesPath)
}

// OOMKill reports the OOM kill count for the memory cgroup located at path.
// The count is obtained by looking up the "oom_kill" key in the
// "memory.oom_control" file via fscommon.GetValueByKey.
func OOMKill(path string) (uint64, error) {
	const (
		controlFile = "memory.oom_control"
		counterKey  = "oom_kill"
	)
	return fscommon.GetValueByKey(path, controlFile, counterKey)
}

// OOMKill reports the OOM kill count for the cgroup, using the path this
// manager returns for the "memory" subsystem.
func (m *manager) OOMKill() (uint64, error) {
	memoryPath := m.Path("memory")
	return OOMKill(memoryPath)
}
8 changes: 8 additions & 0 deletions libcontainer/cgroups/fs2/fs2.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
// Exists reports whether the cgroup path (m.dirPath) exists.
func (m *manager) Exists() bool {
	found := cgroups.PathExists(m.dirPath)
	return found
}

// OOMKill reports the OOM kill count for the cgroup located at path.
// The count is obtained by looking up the "oom_kill" key in the
// "memory.events" file via fscommon.GetValueByKey.
func OOMKill(path string) (uint64, error) {
	const (
		eventsFile = "memory.events"
		counterKey = "oom_kill"
	)
	return fscommon.GetValueByKey(path, eventsFile, counterKey)
}

// OOMKill reports the OOM kill count for the cgroup at m.dirPath.
func (m *manager) OOMKill() (uint64, error) {
	count, err := OOMKill(m.dirPath)
	return count, err
}
4 changes: 4 additions & 0 deletions libcontainer/cgroups/systemd/v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,7 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
// Exists reports whether the cgroup path exists, judged by checking the
// path this manager returns for the "devices" subsystem.
func (m *legacyManager) Exists() bool {
	devicesPath := m.Path("devices")
	return cgroups.PathExists(devicesPath)
}

// OOMKill reports the OOM kill count for the cgroup by delegating to
// fs.OOMKill with the path this manager returns for the "memory" subsystem.
func (m *legacyManager) OOMKill() (uint64, error) {
	memoryPath := m.Path("memory")
	return fs.OOMKill(memoryPath)
}
4 changes: 4 additions & 0 deletions libcontainer/cgroups/systemd/v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -501,3 +501,7 @@ func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
// Exists reports whether the cgroup path (m.path) exists.
func (m *unifiedManager) Exists() bool {
	found := cgroups.PathExists(m.path)
	return found
}

// OOMKill reports the OOM kill count for the cgroup at m.path by
// delegating to fs2.OOMKill.
func (m *unifiedManager) OOMKill() (uint64, error) {
	count, err := fs2.OOMKill(m.path)
	return count, err
}
4 changes: 4 additions & 0 deletions libcontainer/container_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ func (m *mockCgroupManager) Exists() bool {
return err == nil
}

// OOMKill satisfies the cgroup manager interface for tests; the mock
// always reports zero OOM kills and no error.
func (m *mockCgroupManager) OOMKill() (uint64, error) {
	var kills uint64
	return kills, nil
}

// GetPaths returns the paths map stored on the mock, without copying it.
func (m *mockCgroupManager) GetPaths() map[string]string {
	stored := m.paths
	return stored
}
Expand Down
18 changes: 18 additions & 0 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,24 @@ func (p *initProcess) start() (retErr error) {
}
defer func() {
if retErr != nil {
// init might be killed by the kernel's OOM killer.
oom, err := p.manager.OOMKill()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
// Does not matter what the particular error was,
// its cause is most probably OOM, so report that.
const oomError = "container init was OOM-killed (memory limit too low?)"

if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = newSystemErrorWithCause(retErr, oomError)
} else {
retErr = newSystemError(errors.New(oomError))
}
}

// terminate the process to ensure we can remove cgroups
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
Expand Down

0 comments on commit e82a265

Please sign in to comment.