From 4796b6f8b720ad5adaf1625c857502eb213dd5e8 Mon Sep 17 00:00:00 2001 From: cdoern Date: Mon, 21 Feb 2022 11:51:00 -0500 Subject: [PATCH] use runc creation logic switch c/common to use runc cgroup creation so that we can use resource limits this entails modifying both the CgroupControl struct to contain a runc configs.Cgroup entity as well as adding actual functionality to all of the Apply() functions for the various resource groups Signed-off-by: cdoern --- pkg/cgroups/blkio.go | 3 + pkg/cgroups/blkio_linux.go | 151 +++++ pkg/cgroups/cgroups.go | 3 + pkg/cgroups/cgroups_linux.go | 636 ++++++++++++++++++ pkg/cgroups/cgroups_linux_test.go | 66 ++ pkg/cgroups/cgroups_test.go | 3 + pkg/cgroups/cpu.go | 74 +- pkg/cgroups/cpu_linux.go | 90 +++ pkg/cgroups/cpuset.go | 41 +- pkg/cgroups/cpuset_linux.go | 49 ++ pkg/cgroups/memory.go | 3 + pkg/cgroups/memory_linux.go | 69 ++ pkg/cgroups/pids.go | 3 + pkg/cgroups/pids_linux.go | 61 ++ pkg/cgroups/utils.go | 114 ++++ .../cgroups/devices/devices_emulator.go | 386 +++++++++++ .../runc/libcontainer/cgroups/fs/blkio.go | 311 +++++++++ .../runc/libcontainer/cgroups/fs/cpu.go | 129 ++++ .../runc/libcontainer/cgroups/fs/cpuacct.go | 166 +++++ .../runc/libcontainer/cgroups/fs/cpuset.go | 245 +++++++ .../runc/libcontainer/cgroups/fs/devices.go | 109 +++ .../runc/libcontainer/cgroups/fs/error.go | 15 + .../runc/libcontainer/cgroups/fs/freezer.go | 158 +++++ .../runc/libcontainer/cgroups/fs/fs.go | 264 ++++++++ .../runc/libcontainer/cgroups/fs/hugetlb.go | 62 ++ .../runc/libcontainer/cgroups/fs/memory.go | 348 ++++++++++ .../runc/libcontainer/cgroups/fs/name.go | 31 + .../runc/libcontainer/cgroups/fs/net_cls.go | 32 + .../runc/libcontainer/cgroups/fs/net_prio.go | 30 + .../runc/libcontainer/cgroups/fs/paths.go | 186 +++++ .../libcontainer/cgroups/fs/perf_event.go | 24 + .../runc/libcontainer/cgroups/fs/pids.go | 62 ++ .../runc/libcontainer/cgroups/fs/rdma.go | 25 + .../libcontainer/cgroups/fscommon/rdma.go | 121 ++++ .../libcontainer/cgroups/fscommon/utils.go | 145 ++++ vendor/modules.txt | 3 + 36 files changed, 4109 insertions(+), 109 deletions(-) create mode 100644 pkg/cgroups/blkio_linux.go create mode 100644 pkg/cgroups/cgroups_linux.go create mode 100644 pkg/cgroups/cgroups_linux_test.go create mode 100644 pkg/cgroups/cpu_linux.go create mode 100644 pkg/cgroups/cpuset_linux.go create mode 100644 pkg/cgroups/memory_linux.go create mode 100644 pkg/cgroups/pids_linux.go create mode 100644 pkg/cgroups/utils.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go diff --git a/pkg/cgroups/blkio.go b/pkg/cgroups/blkio.go index bacd4eb93..26072004e 100644 --- a/pkg/cgroups/blkio.go +++ b/pkg/cgroups/blkio.go @@ -1,3 +1,6 @@ +//go:build !linux +// +build !linux + package cgroups import ( diff --git a/pkg/cgroups/blkio_linux.go b/pkg/cgroups/blkio_linux.go new file mode 100644 index 000000000..81eee5b0a --- /dev/null +++ b/pkg/cgroups/blkio_linux.go @@ -0,0 +1,151 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +type linuxBlkioHandler struct { + blkio fs.BlkioGroup +} + +func getBlkioHandler() *linuxBlkioHandler { + return &linuxBlkioHandler{} +} + +// Apply set the specified constraints +func (c *linuxBlkioHandler) Apply(ctr *CgroupControl, res *configs.Resources) error { + return c.blkio.Set(ctr.config.Path, res) +} + +// Create the cgroup +func (c *linuxBlkioHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil + } + return ctr.createCgroupDirectory(Blkio) +} + +// Destroy the cgroup +func (c *linuxBlkioHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(Blkio)) +} + +// Stat fills a metrics structure with usage stats for the controller +func (c *linuxBlkioHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { + var ioServiceBytesRecursive []cgroups.BlkioStatEntry + + if ctr.cgroup2 { + // more details on the io.stat file format:X https://facebookmicrosites.github.io/cgroup2/docs/io-controller.html + values, err := readCgroup2MapFile(ctr, "io.stat") + if err != nil { + return err + } + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + major, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Accommodate the cgroup v1 naming + switch op { + case "rbytes": + op = "read" + case "wbytes": + op = "write" + } + + value, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + } + } else { + BlkioRoot := ctr.getCgroupv1Path(Blkio) + + p := filepath.Join(BlkioRoot, "blkio.throttle.io_service_bytes_recursive") + f, err := os.Open(p) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return errors.Wrapf(err, "open %s", p) + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 3 { + continue + } + d := strings.Split(parts[0], ":") + if len(d) != 2 { + continue + } + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + major, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + op := parts[1] + + value, err := strconv.ParseUint(parts[2], 10, 0) + if err != nil { + return err + } + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + if err := scanner.Err(); err != nil { + return errors.Wrapf(err, "parse %s", p) + } + } + m.BlkioStats.IoServiceBytesRecursive = ioServiceBytesRecursive + return nil +} diff --git a/pkg/cgroups/cgroups.go b/pkg/cgroups/cgroups.go index 0bf275f38..fde996b3c 100644 --- a/pkg/cgroups/cgroups.go +++ b/pkg/cgroups/cgroups.go @@ -1,3 +1,6 @@ +//go:build !linux +// +build !linux + package cgroups import ( diff --git a/pkg/cgroups/cgroups_linux.go b/pkg/cgroups/cgroups_linux.go new file mode 100644 index 000000000..2bb04e3f3 --- /dev/null +++ b/pkg/cgroups/cgroups_linux.go @@ -0,0 +1,636 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "bufio" + "bytes" + "context" + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/containers/storage/pkg/unshare" + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +var ( + // ErrCgroupDeleted means the cgroup was deleted + ErrCgroupDeleted = errors.New("cgroup deleted") + // ErrCgroupV1Rootless means the cgroup v1 were attempted to be used in rootless environment + ErrCgroupV1Rootless = errors.New("no support for CGroups V1 in rootless environments") + ErrStatCgroup = errors.New("no cgroup available for gathering user statistics") +) + +// CgroupControl controls a cgroup hierarchy +type CgroupControl struct { + cgroup2 bool + config *configs.Cgroup + systemd bool + // List of additional cgroup subsystems joined that + // do not have a custom handler. + additionalControllers []controller +} + +type controller struct { + name string + symlink bool +} + +type controllerHandler interface { + Create(*CgroupControl) (bool, error) + Apply(*CgroupControl, *configs.Resources) error + Destroy(*CgroupControl) error + Stat(*CgroupControl, *cgroups.Stats) error +} + +const ( + cgroupRoot = "/sys/fs/cgroup" + // CPU is the cpu controller + CPU = "cpu" + // CPUAcct is the cpuacct controller + CPUAcct = "cpuacct" + // CPUset is the cpuset controller + CPUset = "cpuset" + // Memory is the memory controller + Memory = "memory" + // Pids is the pids controller + Pids = "pids" + // Blkio is the blkio controller + Blkio = "blkio" +) + +var handlers map[string]controllerHandler + +func init() { + handlers = make(map[string]controllerHandler) + handlers[CPU] = getCPUHandler() + handlers[CPUset] = getCpusetHandler() + handlers[Memory] = getMemoryHandler() + handlers[Pids] = getPidsHandler() + handlers[Blkio] = getBlkioHandler() +} + +// getAvailableControllers get the available controllers +func getAvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]controller, error) { + if cgroup2 { + controllers := []controller{} + controllersFile := cgroupRoot + "/cgroup.controllers" + // rootless cgroupv2: check available controllers for current user, systemd or servicescope will inherit + if unshare.IsRootless() { + userSlice, err := getCgroupPathForCurrentProcess() + if err != nil { + return controllers, err + } + // userSlice already contains '/' so not adding here + basePath := cgroupRoot + userSlice + controllersFile = fmt.Sprintf("%s/cgroup.controllers", basePath) + } + controllersFileBytes, err := ioutil.ReadFile(controllersFile) + if err != nil { + return nil, errors.Wrapf(err, "failed while reading controllers for cgroup v2 from %q", controllersFile) + } + for _, controllerName := range strings.Fields(string(controllersFileBytes)) { + c := controller{ + name: controllerName, + symlink: false, + } + controllers = append(controllers, c) + } + return controllers, nil + } + + subsystems, _ := cgroupV1GetAllSubsystems() + controllers := []controller{} + // cgroupv1 and rootless: No subsystem is available: delegation is unsafe. + if unshare.IsRootless() { + return controllers, nil + } + + for _, name := range subsystems { + if _, found := exclude[name]; found { + continue + } + fileInfo, err := os.Stat(cgroupRoot + "/" + name) + if err != nil { + continue + } + c := controller{ + name: name, + symlink: !fileInfo.IsDir(), + } + controllers = append(controllers, c) + } + + return controllers, nil +} + +// GetAvailableControllers get string:bool map of all the available controllers +func GetAvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]string, error) { + availableControllers, err := getAvailableControllers(exclude, cgroup2) + if err != nil { + return nil, err + } + controllerList := []string{} + for _, controller := range availableControllers { + controllerList = append(controllerList, controller.name) + } + + return controllerList, nil +} + +func cgroupV1GetAllSubsystems() ([]string, error) { + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil +} + +func getCgroupPathForCurrentProcess() (string, error) { + path := fmt.Sprintf("/proc/%d/cgroup", os.Getpid()) + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + cgroupPath := "" + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + procEntries := strings.SplitN(text, "::", 2) + // set process cgroupPath only if entry is valid + if len(procEntries) > 1 { + cgroupPath = procEntries[1] + } + } + if err := s.Err(); err != nil { + return cgroupPath, err + } + return cgroupPath, nil +} + +// getCgroupv1Path is a helper function to get the cgroup v1 path +func (c *CgroupControl) getCgroupv1Path(name string) string { + return filepath.Join(cgroupRoot, name, c.config.Path) +} + +// createCgroupv2Path creates the cgroupv2 path and enables all the available controllers +func createCgroupv2Path(path string) (deferredError error) { + if !strings.HasPrefix(path, cgroupRoot+"/") { + return fmt.Errorf("invalid cgroup path %s", path) + } + content, err := ioutil.ReadFile(cgroupRoot + "/cgroup.controllers") + if err != nil { + return err + } + ctrs := bytes.Fields(content) + res := append([]byte("+"), bytes.Join(ctrs, []byte(" +"))...) + + current := "/sys/fs" + elements := strings.Split(path, "/") + for i, e := range elements[3:] { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + defer func() { + if deferredError != nil { + os.Remove(current) + } + }() + } + } + // We enable the controllers for all the path components except the last one. It is not allowed to add + // PIDs if there are already enabled controllers. + if i < len(elements[3:])-1 { + if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), res, 0755); err != nil { + return err + } + } + } + return nil +} + +// initialize initializes the specified hierarchy +func (c *CgroupControl) initialize() (err error) { + createdSoFar := map[string]controllerHandler{} + defer func() { + if err != nil { + for name, ctr := range createdSoFar { + if err := ctr.Destroy(c); err != nil { + logrus.Warningf("error cleaning up controller %s for %s", name, c.config.Path) + } + } + } + }() + if c.cgroup2 { + if err := createCgroupv2Path(filepath.Join(cgroupRoot, c.config.Path)); err != nil { + return errors.Wrapf(err, "error creating cgroup path %s", c.config.Path) + } + } + for name, handler := range handlers { + created, err := handler.Create(c) + if err != nil { + return err + } + if err != nil { + return err + } + if created { + createdSoFar[name] = handler + } + } + + if !c.cgroup2 { + // We won't need to do this for cgroup v2 + for _, ctr := range c.additionalControllers { + if ctr.symlink { + continue + } + path := c.getCgroupv1Path(ctr.name) + if err := os.MkdirAll(path, 0755); err != nil { + return errors.Wrapf(err, "error creating cgroup path for %s", ctr.name) + } + } + } + + return nil +} + +func (c *CgroupControl) createCgroupDirectory(controller string) (bool, error) { + cPath := c.getCgroupv1Path(controller) + _, err := os.Stat(cPath) + if err == nil { + return false, nil + } + + if !os.IsNotExist(err) { + return false, err + } + + if err := os.MkdirAll(cPath, 0755); err != nil { + return false, errors.Wrapf(err, "error creating cgroup for %s", controller) + } + return true, nil +} + +func readFileAsUint64(path string) (uint64, error) { + data, err := ioutil.ReadFile(path) + if err != nil { + return 0, err + } + v := cleanString(string(data)) + if v == "max" { + return math.MaxUint64, nil + } + ret, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return ret, errors.Wrapf(err, "parse %s from %s", v, path) + } + return ret, nil +} + +func readFileByKeyAsUint64(path, key string) (uint64, error) { + content, err := ioutil.ReadFile(path) + if err != nil { + return 0, err + } + for _, line := range strings.Split(string(content), "\n") { + fields := strings.SplitN(line, " ", 2) + if fields[0] == key { + v := cleanString(string(fields[1])) + if v == "max" { + return math.MaxUint64, nil + } + ret, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return ret, errors.Wrapf(err, "parse %s from %s", v, path) + } + return ret, nil + } + } + + return 0, fmt.Errorf("no key named %s from %s", key, path) +} + +// New creates a new cgroup control +func New(path string, resources *configs.Resources) (*CgroupControl, error) { + cgroup2, err := IsCgroup2UnifiedMode() + if err != nil { + return nil, err + } + control := &CgroupControl{ + cgroup2: cgroup2, + config: &configs.Cgroup{ + Path: path, + Resources: resources, + }, + } + + if !cgroup2 { + controllers, err := getAvailableControllers(handlers, false) + if err != nil { + return nil, err + } + control.additionalControllers = controllers + } + + if err := control.initialize(); err != nil { + return nil, err + } + + return control, nil +} + +// NewSystemd creates a new cgroup control +func NewSystemd(path string) (*CgroupControl, error) { + cgroup2, err := IsCgroup2UnifiedMode() + if err != nil { + return nil, err + } + control := &CgroupControl{ + cgroup2: cgroup2, + systemd: true, + config: &configs.Cgroup{ + Path: path, + }, + } + return control, nil +} + +// Load loads an existing cgroup control +func Load(path string) (*CgroupControl, error) { + cgroup2, err := IsCgroup2UnifiedMode() + if err != nil { + return nil, err + } + control := &CgroupControl{ + cgroup2: cgroup2, + systemd: false, + config: &configs.Cgroup{ + Path: path, + }, + } + if !cgroup2 { + controllers, err := getAvailableControllers(handlers, false) + if err != nil { + return nil, err + } + control.additionalControllers = controllers + } + if !cgroup2 { + oneExists := false + // check that the cgroup exists at least under one controller + for name := range handlers { + p := control.getCgroupv1Path(name) + if _, err := os.Stat(p); err == nil { + oneExists = true + break + } + } + + // if there is no controller at all, raise an error + if !oneExists { + if unshare.IsRootless() { + return nil, ErrCgroupV1Rootless + } + // compatible with the error code + // used by containerd/cgroups + return nil, ErrCgroupDeleted + } + } + return control, nil +} + +// CreateSystemdUnit creates the systemd cgroup +func (c *CgroupControl) CreateSystemdUnit(path string) error { + if !c.systemd { + return fmt.Errorf("the cgroup controller is not using systemd") + } + + conn, err := systemdDbus.NewWithContext(context.TODO()) + if err != nil { + return err + } + defer conn.Close() + + return systemdCreate(path, conn) +} + +// GetUserConnection returns an user connection to D-BUS +func GetUserConnection(uid int) (*systemdDbus.Conn, error) { + return systemdDbus.NewConnection(func() (*dbus.Conn, error) { + return dbusAuthConnection(uid, dbus.SessionBusPrivate) + }) +} + +// CreateSystemdUserUnit creates the systemd cgroup for the specified user +func (c *CgroupControl) CreateSystemdUserUnit(path string, uid int) error { + if !c.systemd { + return fmt.Errorf("the cgroup controller is not using systemd") + } + + conn, err := GetUserConnection(uid) + if err != nil { + return err + } + defer conn.Close() + + return systemdCreate(path, conn) +} + +func dbusAuthConnection(uid int, createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { + conn, err := createBus() + if err != nil { + return nil, err + } + + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} + + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, err + } + if err := conn.Hello(); err != nil { + return nil, err + } + + return conn, nil +} + +// Delete cleans a cgroup +func (c *CgroupControl) Delete() error { + return c.DeleteByPath(c.config.Path) +} + +// DeleteByPathConn deletes the specified cgroup path using the specified +// dbus connection if needed. +func (c *CgroupControl) DeleteByPathConn(path string, conn *systemdDbus.Conn) error { + if c.systemd { + return systemdDestroyConn(path, conn) + } + if c.cgroup2 { + return rmDirRecursively(filepath.Join(cgroupRoot, c.config.Path)) + } + var lastError error + for _, h := range handlers { + if err := h.Destroy(c); err != nil { + lastError = err + } + } + + for _, ctr := range c.additionalControllers { + if ctr.symlink { + continue + } + p := c.getCgroupv1Path(ctr.name) + if err := rmDirRecursively(p); err != nil { + lastError = errors.Wrapf(err, "remove %s", p) + } + } + return lastError +} + +// DeleteByPath deletes the specified cgroup path +func (c *CgroupControl) DeleteByPath(path string) error { + if c.systemd { + conn, err := systemdDbus.NewWithContext(context.TODO()) + if err != nil { + return err + } + defer conn.Close() + return c.DeleteByPathConn(path, conn) + } + return c.DeleteByPathConn(path, nil) +} + +// Update updates the cgroups +func (c *CgroupControl) Update(resources *configs.Resources) error { + for _, h := range handlers { + if err := h.Apply(c, resources); err != nil { + return err + } + } + return nil +} + +// AddPid moves the specified pid to the cgroup +func (c *CgroupControl) AddPid(pid int) error { + pidString := []byte(fmt.Sprintf("%d\n", pid)) + + if c.cgroup2 { + p := filepath.Join(cgroupRoot, c.config.Path, "cgroup.procs") + if err := ioutil.WriteFile(p, pidString, 0644); err != nil { + return errors.Wrapf(err, "write %s", p) + } + return nil + } + + names := make([]string, 0, len(handlers)) + for n := range handlers { + names = append(names, n) + } + + for _, c := range c.additionalControllers { + if !c.symlink { + names = append(names, c.name) + } + } + + for _, n := range names { + // If we aren't using cgroup2, we won't write correctly to unified hierarchy + if !c.cgroup2 && n == "unified" { + continue + } + p := filepath.Join(c.getCgroupv1Path(n), "tasks") + if err := ioutil.WriteFile(p, pidString, 0644); err != nil { + return errors.Wrapf(err, "write %s", p) + } + } + return nil +} + +// Stat returns usage statistics for the cgroup +func (c *CgroupControl) Stat() (*cgroups.Stats, error) { + m := cgroups.Stats{} + found := false + for _, h := range handlers { + if err := h.Stat(c, &m); err != nil { + if !os.IsNotExist(errors.Cause(err)) { + return nil, err + } + logrus.Warningf("Failed to retrieve cgroup stats: %v", err) + continue + } + found = true + } + if !found { + return nil, ErrStatCgroup + } + return &m, nil +} + +func readCgroup2MapPath(path string) (map[string][]string, error) { + ret := map[string][]string{} + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return ret, nil + } + return nil, errors.Wrapf(err, "open file %s", path) + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, errors.Wrapf(err, "parsing file %s", path) + } + return ret, nil +} + +func readCgroup2MapFile(ctr *CgroupControl, name string) (map[string][]string, error) { + p := filepath.Join(cgroupRoot, ctr.config.Path, name) + + return readCgroup2MapPath(p) +} diff --git a/pkg/cgroups/cgroups_linux_test.go b/pkg/cgroups/cgroups_linux_test.go new file mode 100644 index 000000000..cfa3da951 --- /dev/null +++ b/pkg/cgroups/cgroups_linux_test.go @@ -0,0 +1,66 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "testing" + + "github.com/containers/storage/pkg/unshare" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestCreated(t *testing.T) { + // tests only works in rootless mode + if unshare.IsRootless() { + return + } + + var resources configs.Resources + cgr, err := New("machine.slice", &resources) + if err != nil { + t.Error(err) + } + if err := cgr.Delete(); err != nil { + t.Error(err) + } + + cgr, err = NewSystemd("machine.slice") + if err != nil { + t.Error(err) + } + if err := cgr.Delete(); err != nil { + t.Error(err) + } +} + +func TestResources(t *testing.T) { + // tests only works in rootless mode + if unshare.IsRootless() { + return + } + + var resources configs.Resources + resources.CpuPeriod = 100000 + resources.CpuQuota = 100000 + + cgr, err := New("machine.slice", &resources) + if err != nil { + t.Error(err) + } + + err = cgr.Update(&resources) + if err != nil { + t.Error(err) + } + + period, err := fscommon.GetCgroupParamUint(cgr.config.Path, "cpu.cfs_period_us") + if err != nil { + t.Fatal(err) + } + if period != 100000 { + t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") + } + +} diff --git a/pkg/cgroups/cgroups_test.go b/pkg/cgroups/cgroups_test.go index b93edc36c..ef5138587 100644 --- a/pkg/cgroups/cgroups_test.go +++ b/pkg/cgroups/cgroups_test.go @@ -1,3 +1,6 @@ +//go:build !linux +// +build !linux + package cgroups import ( diff --git a/pkg/cgroups/cpu.go b/pkg/cgroups/cpu.go index 23539757d..bc1fc561e 100644 --- a/pkg/cgroups/cpu.go +++ b/pkg/cgroups/cpu.go @@ -1,12 +1,12 @@ +//go:build !linux +// +build !linux + package cgroups import ( "fmt" - "io/ioutil" "os" - "path/filepath" "strconv" - "strings" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" @@ -19,36 +19,6 @@ func getCPUHandler() *cpuHandler { return &cpuHandler{} } -func cleanString(s string) string { - return strings.Trim(s, "\n") -} - -func readAcct(ctr *CgroupControl, name string) (uint64, error) { - p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) - return readFileAsUint64(p) -} - -func readAcctList(ctr *CgroupControl, name string) ([]uint64, error) { - p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) - data, err := ioutil.ReadFile(p) - if err != nil { - return nil, errors.Wrapf(err, "reading %s", p) - } - r := []uint64{} - for _, s := range strings.Split(string(data), " ") { - s = cleanString(s) - if s == "" { - break - } - v, err := strconv.ParseUint(s, 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "parsing %s", s) - } - r = append(r, v) - } - return r, nil -} - // Apply set the specified constraints func (c *cpuHandler) Apply(ctr *CgroupControl, res *spec.LinuxResources) error { if res.CPU == nil { @@ -120,41 +90,3 @@ func (c *cpuHandler) Stat(ctr *CgroupControl, m *Metrics) error { m.CPU = CPUMetrics{Usage: usage} return nil } - -// GetSystemCPUUsage returns the system usage for all the cgroups -func GetSystemCPUUsage() (uint64, error) { - cgroupv2, err := IsCgroup2UnifiedMode() - if err != nil { - return 0, err - } - if !cgroupv2 { - p := filepath.Join(cgroupRoot, CPUAcct, "cpuacct.usage") - return readFileAsUint64(p) - } - - files, err := ioutil.ReadDir(cgroupRoot) - if err != nil { - return 0, err - } - var total uint64 - for _, file := range files { - if !file.IsDir() { - continue - } - p := filepath.Join(cgroupRoot, file.Name(), "cpu.stat") - - values, err := readCgroup2MapPath(p) - if err != nil { - return 0, err - } - - if val, found := values["usage_usec"]; found { - v, err := strconv.ParseUint(cleanString(val[0]), 10, 64) - if err != nil { - return 0, err - } - total += v * 1000 - } - } - return total, nil -} diff --git a/pkg/cgroups/cpu_linux.go b/pkg/cgroups/cpu_linux.go new file mode 100644 index 000000000..588d97f8e --- /dev/null +++ b/pkg/cgroups/cpu_linux.go @@ -0,0 +1,90 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +type linuxCpuHandler struct { + cpu fs.CpuGroup +} + +func getCPUHandler() *linuxCpuHandler { + return &linuxCpuHandler{} +} + +// Apply set the specified constraints +func (c *linuxCpuHandler) Apply(ctr *CgroupControl, res *configs.Resources) error { + return c.cpu.Set(ctr.config.Path, res) +} + +// Create the cgroup +func (c *linuxCpuHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil + } + return ctr.createCgroupDirectory(CPU) +} + +// Destroy the cgroup +func (c *linuxCpuHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(CPU)) +} + +// Stat fills a metrics structure with usage stats for the controller +func (c *linuxCpuHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { + var err error + cpu := cgroups.CpuStats{} + if ctr.cgroup2 { + values, err := readCgroup2MapFile(ctr, "cpu.stat") + if err != nil { + return err + } + if val, found := values["usage_usec"]; found { + cpu.CpuUsage.TotalUsage, err = strconv.ParseUint(cleanString(val[0]), 10, 64) + if err != nil { + return err + } + cpu.CpuUsage.UsageInKernelmode *= 1000 + } + if val, found := values["system_usec"]; found { + cpu.CpuUsage.UsageInKernelmode, err = strconv.ParseUint(cleanString(val[0]), 10, 64) + if err != nil { + return err + } + cpu.CpuUsage.TotalUsage *= 1000 + } + } else { + cpu.CpuUsage.TotalUsage, err = readAcct(ctr, "cpuacct.usage") + if err != nil { + if !os.IsNotExist(errors.Cause(err)) { + return err + } + cpu.CpuUsage.TotalUsage = 0 + } + cpu.CpuUsage.UsageInKernelmode, err = readAcct(ctr, "cpuacct.usage_sys") + if err != nil { + if !os.IsNotExist(errors.Cause(err)) { + return err + } + cpu.CpuUsage.UsageInKernelmode = 0 + } + cpu.CpuUsage.PercpuUsage, err = readAcctList(ctr, "cpuacct.usage_percpu") + if err != nil { + if !os.IsNotExist(errors.Cause(err)) { + return err + } + cpu.CpuUsage.PercpuUsage = nil + } + } + m.CpuStats = cpu + return nil +} diff --git a/pkg/cgroups/cpuset.go b/pkg/cgroups/cpuset.go index 22ac0a079..bb9e3abdc 100644 --- a/pkg/cgroups/cpuset.go +++ b/pkg/cgroups/cpuset.go @@ -1,53 +1,18 @@ +//go:build !linux +// +build !linux + package cgroups import ( "fmt" - "io/ioutil" "path/filepath" - "strings" spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" ) type cpusetHandler struct { } -func cpusetCopyFileFromParent(dir, file string, cgroupv2 bool) ([]byte, error) { - if dir == cgroupRoot { - return nil, fmt.Errorf("could not find parent to initialize cpuset %s", file) - } - path := filepath.Join(dir, file) - parentPath := path - if cgroupv2 { - parentPath = fmt.Sprintf("%s.effective", parentPath) - } - data, err := ioutil.ReadFile(parentPath) - if err != nil { - return nil, errors.Wrapf(err, "open %s", path) - } - if strings.Trim(string(data), "\n") != "" { - return data, nil - } - data, err = cpusetCopyFileFromParent(filepath.Dir(dir), file, cgroupv2) - if err != nil { - return nil, err - } - if err := ioutil.WriteFile(path, data, 0644); err != nil { - return nil, errors.Wrapf(err, "write %s", path) - } - return data, nil -} - -func cpusetCopyFromParent(path string, cgroupv2 bool) error { - for _, file := range []string{"cpuset.cpus", "cpuset.mems"} { - if _, err := cpusetCopyFileFromParent(path, file, cgroupv2); err != nil { - return err - } - } - return nil -} - func getCpusetHandler() *cpusetHandler { return &cpusetHandler{} } diff --git a/pkg/cgroups/cpuset_linux.go b/pkg/cgroups/cpuset_linux.go new file mode 100644 index 000000000..b94956a2b --- /dev/null +++ b/pkg/cgroups/cpuset_linux.go @@ -0,0 +1,49 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type linuxCpusetHandler struct { + cpuset fs.CpusetGroup +} + +func getCpusetHandler() *linuxCpusetHandler { + return &linuxCpusetHandler{} +} + +// Apply set the specified constraints +func (c *linuxCpusetHandler) Apply(ctr *CgroupControl, res *configs.Resources) error { + return c.cpuset.Set(ctr.config.Path, res) +} + +// Create the cgroup +func (c *linuxCpusetHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + path := filepath.Join(cgroupRoot, ctr.config.Path) + return true, cpusetCopyFromParent(path, true) + } + + created, err := ctr.createCgroupDirectory(CPUset) + if !created || err != nil { + return created, err + } + return true, cpusetCopyFromParent(ctr.getCgroupv1Path(CPUset), false) +} + +// Destroy the cgroup +func (c *linuxCpusetHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(CPUset)) +} + +// Stat fills a metrics structure with usage stats for the controller +func (c *linuxCpusetHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { + return nil +} diff --git a/pkg/cgroups/memory.go b/pkg/cgroups/memory.go index 10d65893c..b597b85bf 100644 --- a/pkg/cgroups/memory.go +++ b/pkg/cgroups/memory.go @@ -1,3 +1,6 @@ +//go:build !linux +// +build !linux + package cgroups import ( diff --git a/pkg/cgroups/memory_linux.go b/pkg/cgroups/memory_linux.go new file mode 100644 index 000000000..9ce6c7948 --- /dev/null +++ b/pkg/cgroups/memory_linux.go @@ -0,0 +1,69 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type linuxMemHandler struct { + mem fs.MemoryGroup +} + +func getMemoryHandler() *linuxMemHandler { + return &linuxMemHandler{} +} + +// Apply set the specified constraints +func (c *linuxMemHandler) Apply(ctr *CgroupControl, res *configs.Resources) error { + return c.mem.Set(ctr.config.Path, res) +} + +// Create the cgroup +func (c *linuxMemHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil + } + return ctr.createCgroupDirectory(Memory) +} + +// Destroy the cgroup +func (c *linuxMemHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(Memory)) +} + +// Stat fills a metrics structure with usage stats for the controller +func (c *linuxMemHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { + var err error + memUsage := cgroups.MemoryStats{} + + var memoryRoot string + var limitFilename string + + if ctr.cgroup2 { + memoryRoot = filepath.Join(cgroupRoot, ctr.config.Path) + limitFilename = "memory.max" + if memUsage.Usage.Usage, err = readFileByKeyAsUint64(filepath.Join(memoryRoot, "memory.stat"), "anon"); err != nil { + return err + } + } else { + memoryRoot = ctr.getCgroupv1Path(Memory) + limitFilename = "memory.limit_in_bytes" + if memUsage.Usage.Usage, err = readFileAsUint64(filepath.Join(memoryRoot, "memory.usage_in_bytes")); err != nil { + return err + } + } + + memUsage.Usage.Limit, err = readFileAsUint64(filepath.Join(memoryRoot, limitFilename)) + if err != nil { + return err + } + + m.MemoryStats = memUsage + return nil +} diff --git a/pkg/cgroups/pids.go b/pkg/cgroups/pids.go index 58cb32b3b..b281b18a2 100644 --- a/pkg/cgroups/pids.go +++ b/pkg/cgroups/pids.go @@ -1,3 +1,6 @@ +//go:build !linux +// +build !linux + package cgroups import ( diff --git a/pkg/cgroups/pids_linux.go b/pkg/cgroups/pids_linux.go new file mode 100644 index 000000000..3af7626ca --- /dev/null +++ b/pkg/cgroups/pids_linux.go @@ -0,0 +1,61 @@ +//go:build linux +// +build linux + +package cgroups + +import ( + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type linuxPidHandler struct { + pid fs.PidsGroup +} + +func getPidsHandler() *linuxPidHandler { + return &linuxPidHandler{} +} + +// Apply set the specified constraints +func (c *linuxPidHandler) Apply(ctr *CgroupControl, res *configs.Resources) error { + return c.pid.Set(ctr.config.Path, res) +} + +// Create the cgroup +func (c *linuxPidHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil + } + return ctr.createCgroupDirectory(Pids) +} + +// Destroy the cgroup +func (c *linuxPidHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(Pids)) +} + +// Stat fills a metrics structure with usage stats for the controller +func (c *linuxPidHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { + if ctr.config.Path == "" { + // nothing we can do to retrieve the pids.current path + return nil + } + + var PIDRoot string + if ctr.cgroup2 { + PIDRoot = filepath.Join(cgroupRoot, ctr.config.Path) + } else { + PIDRoot = ctr.getCgroupv1Path(Pids) + } + + current, err := readFileAsUint64(filepath.Join(PIDRoot, "pids.current")) + if err != nil { + return err + } + + m.PidsStats.Current = current + return nil +} diff --git a/pkg/cgroups/utils.go b/pkg/cgroups/utils.go new file mode 100644 index 000000000..d0169ee4e --- /dev/null +++ b/pkg/cgroups/utils.go @@ -0,0 +1,114 @@ +package cgroups + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "github.com/pkg/errors" +) + +func cleanString(s string) string { + return strings.Trim(s, "\n") +} + +func readAcct(ctr *CgroupControl, name string) (uint64, error) { + p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) + return readFileAsUint64(p) +} + +func readAcctList(ctr *CgroupControl, name string) ([]uint64, error) { + p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) + data, err := ioutil.ReadFile(p) + if err != nil { + return nil, errors.Wrapf(err, "reading %s", p) + } + r := []uint64{} + for _, s := range strings.Split(string(data), " ") { + s = cleanString(s) + if s == "" { + break + } + v, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return nil, errors.Wrapf(err, "parsing %s", s) + } + r = append(r, v) + } + return r, nil +} + +// GetSystemCPUUsage returns the system usage for all the cgroups +func GetSystemCPUUsage() (uint64, error) { + cgroupv2, err := IsCgroup2UnifiedMode() + if err != nil { + return 0, err + } + if !cgroupv2 { + p := filepath.Join(cgroupRoot, CPUAcct, "cpuacct.usage") + return readFileAsUint64(p) + } + + files, err := ioutil.ReadDir(cgroupRoot) + if err != nil { + return 0, err + } + var total uint64 + for _, file := range files { + if !file.IsDir() { + continue + } + p := filepath.Join(cgroupRoot, file.Name(), "cpu.stat") + + values, err := readCgroup2MapPath(p) + if err != nil { + return 0, err + } + + if val, found := values["usage_usec"]; found { + v, err := strconv.ParseUint(cleanString(val[0]), 10, 64) + if err != nil { + return 0, err + } + total += v * 1000 + } + } + return total, nil +} + +func cpusetCopyFileFromParent(dir, file string, cgroupv2 bool) ([]byte, error) { + if dir == cgroupRoot { + return nil, fmt.Errorf("could not find parent to initialize cpuset %s", file) + } + path := filepath.Join(dir, file) + parentPath := path + if cgroupv2 { + parentPath = fmt.Sprintf("%s.effective", parentPath) + } + data, err := ioutil.ReadFile(parentPath) + if err != nil { + return nil, errors.Wrapf(err, "open %s", path) + } + if strings.Trim(string(data), "\n") != "" { + return data, nil + } + data, err = cpusetCopyFileFromParent(filepath.Dir(dir), file, cgroupv2) + if err != nil { + return nil, err + } + if err := ioutil.WriteFile(path, data, 0644); err != nil { + return nil, errors.Wrapf(err, "write %s", path) + } + return data, nil +} + +func cpusetCopyFromParent(path string, cgroupv2 bool) error { + for _, file := range []string{"cpuset.cpus", "cpuset.mems"} { + if _, err := cpusetCopyFileFromParent(path, file, cgroupv2); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go new file mode 100644 index 000000000..6c61ee4c0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bufio" + "fmt" + "io" + "sort" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/devices" +) + +// deviceMeta is a Rule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node devices.Type + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, Permissions). +type deviceRule struct { + meta deviceMeta + perms devices.Permissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]devices.Permissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). + a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type Emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *Emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +func parseLine(line string) (*deviceRule, error) { + // Input: node major:minor perms. + fields := strings.FieldsFunc(line, func(r rune) bool { + return r == ' ' || r == ':' + }) + if len(fields) != 4 { + return nil, fmt.Errorf("malformed devices.list rule %s", line) + } + + var ( + rule deviceRule + node = fields[0] + major = fields[1] + minor = fields[2] + perms = fields[3] + ) + + // Parse the node type. + switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = devices.BlockDevice + case "c": + rule.meta.node = devices.CharDevice + default: + return nil, fmt.Errorf("unknown device type %q", node) + } + + // Parse the major number. + if major == "*" { + rule.meta.major = devices.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid major number: %w", err) + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = devices.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid minor number: %w", err) + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. + rule.perms = devices.Permissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam + if e.rules == nil { + e.rules = make(map[deviceMeta]devices.Permissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *Emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). + for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, + {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. + partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *Emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") + } else { + err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") + } + return err +} + +func (e *Emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") + } else { + err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") + } + return err +} + +func (e *Emulator) Apply(rule devices.Rule) error { + if !rule.Type.CanCgroup() { + return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == devices.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } + + return e.deny(innerRule) +} + +// EmulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func EmulatorFromList(list io.Reader) (*Emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &Emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, fmt.Errorf("error parsing line %q: %w", line, err) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). + if err := e.allow(deviceRule); err != nil { + return nil, fmt.Errorf("error adding devices.list rule: %w", err) + } + } + if err := s.Err(); err != nil { + return nil, fmt.Errorf("error reading devices.list lines: %w", err) + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurious +// device errors (if possible). +func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) { + var transitionRules []*devices.Rule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. + if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &devices.Rule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: devices.Permissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} + +// Rules returns the minimum set of rules necessary to convert a *deny-all* +// cgroup to the emulated filter state (note that this is not the same as a +// default cgroupv1 cgroup -- which is allow-all). This is effectively just a +// wrapper around Transition() with the source emulator being an empty cgroup. +func (e *Emulator) Rules() ([]*devices.Rule, error) { + defaultCgroup := &Emulator{defaultAllow: false} + return defaultCgroup.Transition(e) +} + +func wrapErr(err error, text string) error { + if err == nil { + return nil + } + return fmt.Errorf(text+": %w", err) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go new file mode 100644 index 000000000..c81b6562a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go @@ -0,0 +1,311 @@ +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type BlkioGroup struct { + weightFilename string + weightDeviceFilename string +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *BlkioGroup) Set(path string, r *configs.Resources) error { + s.detectWeightFilenames(path) + if r.BlkioWeight != 0 { + if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } + + if r.BlkioLeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range r.BlkioWeightDevice { + if wd.Weight != 0 { + if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { + return err + } + } + if wd.LeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, malformedLine(dir, file, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + if err := sc.Err(); err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + bfqDebugStats := []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + bfqStats := []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + cfqStats := []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + throttleRecursiveStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + baseStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + orderedStats := [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, + throttleRecursiveStats, + baseStats, + } + + var blkioStats []cgroups.BlkioStatEntry + var err error + + for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + // finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } + } + return nil +} + +func (s *BlkioGroup) detectWeightFilenames(path string) { + if s.weightFilename != "" { + // Already detected. + return + } + if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { + s.weightFilename = "blkio.weight" + s.weightDeviceFilename = "blkio.weight_device" + } else { + s.weightFilename = "blkio.bfq.weight" + s.weightDeviceFilename = "blkio.bfq.weight_device" + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go new file mode 100644 index 000000000..6c79f899b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -0,0 +1,129 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +type CpuGroup struct{} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error { + if r.CpuRtPeriod != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil { + return err + } + } + if r.CpuRtRuntime != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { + return err + } + } + return nil +} + +func (s *CpuGroup) Set(path string, r *configs.Resources) error { + if r.CpuShares != 0 { + shares := r.CpuShares + if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { + return err + } + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + return err + } + // ... and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) + } + } + + var period string + if r.CpuPeriod != 0 { + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" + } + } + if r.CpuQuota != 0 { + if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } + } + return s.SetRtSched(path, r) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go new file mode 100644 index 000000000..d3bd7e111 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go @@ -0,0 +1,166 @@ +package fs + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + cgroupCpuacctUsageAll = "cpuacct.usage_all" + + nanosecondsInSecond = 1000000000 + + userModeColumn = 1 + kernelModeColumn = 2 + cuacctUsageAllColumnsNumber = 3 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 +) + +type CpuacctGroup struct{} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + var userModeUsage, kernelModeUsage uint64 + const ( + userField = "user" + systemField = "system" + file = cgroupCpuacctStat + ) + + // Expected format: + // user + // system + data, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, 0, err + } + // TODO: use strings.SplitN instead. + fields := strings.Fields(data) + if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { + return 0, 0, malformedLine(path, file, data) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + const file = "cpuacct.usage_percpu" + percpuUsage := []uint64{} + data, err := cgroups.ReadFile(path, file) + if err != nil { + return percpuUsage, err + } + // TODO: use strings.SplitN instead. + for _, value := range strings.Fields(data) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, &parseError{Path: path, File: file, Err: err} + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} + const file = cgroupCpuacctUsageAll + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer fd.Close() + + scanner := bufio.NewScanner(fd) + scanner.Scan() // skipping header line + + for scanner.Scan() { + lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1) + if len(lineFields) != cuacctUsageAllColumnsNumber { + continue + } + + usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageKernelMode = append(usageKernelMode, usageInKernelMode) + + usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageUserMode = append(usageUserMode, usageInUserMode) + } + if err := scanner.Err(); err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go new file mode 100644 index 000000000..550baa427 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -0,0 +1,245 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpusetGroup struct{} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error { + return s.ApplyDir(path, r, pid) +} + +func (s *CpusetGroup) Set(path string, r *configs.Resources) error { + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil { + return err + } + } + return nil +} + +func getCpusetStat(path string, file string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, file) + if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} + } + + for _, s := range strings.Split(fileContent, ",") { + sp := strings.SplitN(s, "-", 3) + switch len(sp) { + case 3: + return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")} + case 2: + min, err := strconv.ParseUint(sp[0], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + max, err := strconv.ParseUint(sp[1], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + if min > max { + return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")} + } + for i := min; i <= max; i++ { + extracted = append(extracted, uint16(i)) + } + case 1: + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { + return err + } + if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(dir, pid) +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). +func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParent(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error { + if err := s.Set(path, r); err != nil { + return err + } + return cpusetCopyIfNeeded(path, filepath.Dir(path)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go new file mode 100644 index 000000000..4527a70eb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -0,0 +1,109 @@ +package fs + +import ( + "bytes" + "errors" + "reflect" + + "github.com/opencontainers/runc/libcontainer/cgroups" + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" +) + +type DevicesGroup struct { + TestingSkipFinalCheck bool +} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error { + if r.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. + return errSubsystemDoesNotExist + } + + return apply(path, pid) +} + +func loadEmulator(path string) (*cgroupdevices.Emulator, error) { + list, err := cgroups.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) { + // This defaults to a white-list -- which is what we want! + emu := &cgroupdevices.Emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} + +func (s *DevicesGroup) Set(path string, r *configs.Resources) error { + if userns.RunningInUserNS() || r.SkipDevices { + return nil + } + + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(r.Devices) + if err != nil { + return err + } + + // Compute the minimal set of transition rules needed to achieve the + // requested state. + transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" + } + if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { + return err + } + } + + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. + if !s.TestingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { + return err + } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } + } + return nil +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go new file mode 100644 index 000000000..f2ab6f130 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go @@ -0,0 +1,15 @@ +package fs + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +// malformedLine is used by all cgroupfs file parsers that expect a line +// in a particular format but get some garbage instead. +func malformedLine(path, file, line string) error { + return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go new file mode 100644 index 000000000..987f1bf5e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -0,0 +1,158 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type FreezerGroup struct{} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { + switch r.Freezer { + case configs.Frozen: + defer func() { + if Err != nil { + // Freezing failed, and it is bad and dangerous + // to leave the cgroup in FROZEN or FREEZING + // state, so (try to) thaw it back. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + } + }() + + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. While current + // kernel cgroup v1 docs no longer mention a need to retry, + // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup v1 while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios ("runc pause/unpause + // with parallel runc exec" and "bare freeze/unfreeze on a very + // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze in "pause/unpause + // with parallel exec" reproducer. OTOH, adding an occasional + // sleep helped for the case where the system is extremely slow + // (CentOS 7 VM on GHA CI). + // + // Alas, this is still a game of chances, since the real fix + // belong to the kernel (cgroup v2 do not have this bug). + + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Occasional thaw and sleep improves + // the chances to succeed in freezing + // in case new processes keep appearing + // in the cgroup. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { + return err + } + + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances to + // succeed in freezing in case of a very slow + // system. + time.Sleep(10 * time.Microsecond) + } + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + return err + } + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(configs.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) + } + } + // Despite our best efforts, it got stuck in FREEZING. + return errors.New("unable to freeze") + case configs.Thawed: + return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) + } +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { + for { + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return configs.Thawed, nil + case "FROZEN": + // Find out whether the cgroup is frozen directly, + // or indirectly via an ancestor. + self, err := cgroups.ReadFile(path, "freezer.self_freezing") + if err != nil { + // If the kernel is too old, then we just treat + // it as being frozen. + if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Frozen, err + } + switch self { + case "0\n": + return configs.Thawed, nil + case "1\n": + return configs.Frozen, nil + default: + return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) + } + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go new file mode 100644 index 000000000..fb4fcc7f7 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go @@ -0,0 +1,264 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +func init() { + // If using cgroups-hybrid mode then add a "" controller indicating + // it should join the cgroups v2. + if cgroups.IsCgroup2HybridMode() { + subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats fills in the stats for the subsystem. + GetStats(path string, stats *cgroups.Stats) error + // Apply creates and joins a cgroup, adding pid into it. Some + // subsystems use resources to pre-configure the cgroup parents + // before creating or joining it. + Apply(path string, r *configs.Resources, pid int) error + // Set sets the cgroup resources. + Set(path string, r *configs.Resources) error +} + +type manager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string +} + +func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + // Some v1 controllers (cpu, cpuset, and devices) expect + // cgroups.Resources to not be nil in Apply. + if cg.Resources == nil { + return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation") + } + if cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + + return &manager{ + cgroups: cg, + paths: paths, + }, nil +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. + var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *manager) Apply(pid int) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + + for _, sys := range subsystems { + name := sys.Name() + p, ok := m.paths[name] + if !ok { + continue + } + + if err := sys.Apply(p, c.Resources, pid); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems here, but do delete the path from + // the m.paths map, since it is either non-existent and could not + // be created, or the pid could not be added to it. + // + // Cases where limits for the subsystem have been set are handled + // later by Set, which fails with a friendly error (see + // if path == "" in Set). + if isIgnorableError(c.Rootless, err) && c.Path == "" { + delete(m.paths, name) + continue + } + return err + } + + } + return nil +} + +func (m *manager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *manager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, r); err != nil { + // When rootless is true, errors from the device subsystem + // are ignored, as it is really not expected to work. + if m.cgroups.Rootless && sys.Name() == "devices" { + continue + } + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *manager) Freeze(state configs.FreezerState) error { + path := m.Path("freezer") + if path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups.Resources); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.Path("devices")) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.Path("devices")) +} + +func (m *manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. + if dir == "" { + return configs.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.Path("memory")) + // Ignore ENOENT when rootless as it couldn't create cgroup. + if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go new file mode 100644 index 000000000..8ddd6fdd8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -0,0 +1,62 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type HugetlbGroup struct{} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *HugetlbGroup) Set(path string, r *configs.Resources) error { + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range cgroups.HugePageSizes() { + usage := "hugetlb." + pageSize + ".usage_in_bytes" + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + return err + } + hugetlbStats.Usage = value + + maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return err + } + hugetlbStats.MaxUsage = value + + failcnt := "hugetlb." + pageSize + ".failcnt" + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go new file mode 100644 index 000000000..b7c75f941 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -0,0 +1,348 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" +) + +type MemoryGroup struct{} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. + usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *configs.Resources) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. + if r.Memory == -1 && r.MemorySwap == 0 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + r.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if r.Memory != 0 && r.MemorySwap != 0 { + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + if err := setMemory(path, r.Memory); err != nil { + return err + } + return nil + } + } + + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + + return nil +} + +func (s *MemoryGroup) Set(path string, r *configs.Resources) error { + if err := setMemoryAndSwap(path, r); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if r.MemoryReservation != 0 { + if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { + return err + } + } + + if r.OomKillDisable { + if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { + return nil + } else if *r.MemorySwappiness <= 100 { + if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." + name + } + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore ENOENT as swap and kmem controllers + // are optional in the kernel. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { + const ( + maxColumns = math.MaxUint8 + 1 + file = "memory.numa_stat" + ) + stats := cgroups.PageUsageByNUMA{} + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + defer fd.Close() + + // File format is documented in linux/Documentation/cgroup-v1/memory.txt + // and it looks like this: + // + // total= N0= N1= ... + // file= N0= N1= ... + // anon= N0= N1= ... + // unevictable= N0= N1= ... + // hierarchical_= N0= N1= ... + + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + var field *cgroups.PageStats + + line := scanner.Text() + columns := strings.SplitN(line, " ", maxColumns) + for i, column := range columns { + byNode := strings.SplitN(column, "=", 2) + // Some custom kernels have non-standard fields, like + // numa_locality 0 0 0 0 0 0 0 0 0 0 + // numa_exectime 0 + if len(byNode) < 2 { + if i == 0 { + // Ignore/skip those. + break + } else { + // The first column was already validated, + // so be strict to the rest. + return stats, malformedLine(path, file, line) + } + } + key, val := byNode[0], byNode[1] + if i == 0 { // First column: key is name, val is total. + field = getNUMAField(&stats, key) + if field == nil { // unknown field (new kernel?) + break + } + field.Total, err = strconv.ParseUint(val, 0, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + field.Nodes = map[uint8]uint64{} + } else { // Subsequent columns: key is N, val is usage. + if len(key) < 2 || key[0] != 'N' { + // This is definitely an error. + return stats, malformedLine(path, file, line) + } + + n, err := strconv.ParseUint(key[1:], 10, 8) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + usage, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + field.Nodes[uint8(n)] = usage + } + + } + } + if err := scanner.Err(); err != nil { + return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} + } + + return stats, nil +} + +func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { + switch name { + case "total": + return &stats.Total + case "file": + return &stats.File + case "anon": + return &stats.Anon + case "unevictable": + return &stats.Unevictable + case "hierarchical_total": + return &stats.Hierarchical.Total + case "hierarchical_file": + return &stats.Hierarchical.File + case "hierarchical_anon": + return &stats.Hierarchical.Anon + case "hierarchical_unevictable": + return &stats.Hierarchical.Unevictable + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go new file mode 100644 index 000000000..b8d5d849c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go @@ -0,0 +1,31 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error { + if s.Join { + // Ignore errors if the named cgroup does not exist. + _ = apply(path, pid) + } + return nil +} + +func (s *NameGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go new file mode 100644 index 000000000..abfd09ce8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -0,0 +1,32 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetClsGroup struct{} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetClsGroup) Set(path string, r *configs.Resources) error { + if r.NetClsClassid != 0 { + if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go new file mode 100644 index 000000000..da74d3779 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go @@ -0,0 +1,30 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetPrioGroup struct{} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetPrioGroup) Set(path string, r *configs.Resources) error { + for _, prioMap := range r.NetPrioIfpriomap { + if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go new file mode 100644 index 000000000..1092331b2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go @@ -0,0 +1,186 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +// The absolute path to the root of the cgroup hierarchies. +var ( + cgroupRootLock sync.Mutex + cgroupRoot string +) + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func initPaths(cg *configs.Cgroup) (map[string]string, error) { + root, err := rootPath() + if err != nil { + return nil, err + } + + inner, err := innerPath(cg) + if err != nil { + return nil, err + } + + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + path, err := subsysPath(root, inner, name) + if err != nil { + // The non-presence of the devices subsystem + // is considered fatal for security reasons. + if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { + continue + } + + return nil, err + } + paths[name] = path + } + + return paths, nil +} + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. + var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// rootPath finds and returns path to the root of the cgroup hierarchies. +func rootPath() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo + mi, err := cgroups.GetCgroupMounts(false) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.New("no cgroup mount found in mountinfo") + } + + // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), + // use its parent directory. + root := filepath.Dir(mi[0].Mountpoint) + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +func innerPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(c.Path) + if innerPath == "" { + cgParent := utils.CleanPath(c.Parent) + cgName := utils.CleanPath(c.Name) + innerPath = filepath.Join(cgParent, cgName) + } + + return innerPath, nil +} + +func subsysPath(root, inner, subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(inner) { + mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(root, filepath.Base(mnt), inner), nil + } + + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, inner), nil +} + +func apply(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go new file mode 100644 index 000000000..b86955c8f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go @@ -0,0 +1,24 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PerfEventGroup struct{} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go new file mode 100644 index 000000000..1f13532a5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,62 @@ +package fs + +import ( + "math" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct{} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PidsGroup) Set(path string, r *configs.Resources) error { + if r.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if r.PidsLimit > 0 { + limit = strconv.FormatInt(r.PidsLimit, 10) + } + + if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return err + } + + max, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. + if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go new file mode 100644 index 000000000..5bbe0f35f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go @@ -0,0 +1,25 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *RdmaGroup) Set(path string, r *configs.Resources) error { + return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go new file mode 100644 index 000000000..d463d15ee --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go @@ -0,0 +1,121 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + parts := strings.SplitN(raw, "=", 3) + + if len(parts) != 2 { + return errors.New("Unable to parse RDMA entry") + } + + k, v := parts[0], parts[1] + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + if k == "hca_handle" { + entry.HcaHandles = value + } else if k == "hca_object" { + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. +// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits configs.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. +func RdmaSet(path string, r *configs.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go new file mode 100644 index 000000000..f4a51c9e5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go @@ -0,0 +1,145 @@ +package fscommon + +import ( + "errors" + "fmt" + "math" + "path" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +var ( + // Deprecated: use cgroups.OpenFile instead. + OpenFile = cgroups.OpenFile + // Deprecated: use cgroups.ReadFile instead. + ReadFile = cgroups.ReadFile + // Deprecated: use cgroups.WriteFile instead. + WriteFile = cgroups.WriteFile +) + +// ParseError records a parse error details, including the file path. +type ParseError struct { + Path string + File string + Err error +} + +func (e *ParseError) Error() string { + return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// ParseKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (ParseUint is used to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. +func ParseKeyValue(t string) (string, uint64, error) { + parts := strings.SplitN(t, " ", 3) + if len(parts) != 2 { + return "", 0, fmt.Errorf("line %q is not in key value format", t) + } + + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, err + } + + return parts[0], value, nil +} + +// GetValueByKey reads a key-value pairs from the specified cgroup file, +// and returns a value of the specified key. ParseUint is used for value +// conversion. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + + lines := strings.Split(content, "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + val, err := ParseUint(arr[1], 10, 64) + if err != nil { + err = &ParseError{Path: path, File: file, Err: err} + } + return val, err + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. +func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. +func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 49bcda963..1c7de38a0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -364,6 +364,9 @@ github.com/opencontainers/image-spec/specs-go/v1 ## explicit github.com/opencontainers/runc/libcontainer/apparmor github.com/opencontainers/runc/libcontainer/cgroups +github.com/opencontainers/runc/libcontainer/cgroups/devices +github.com/opencontainers/runc/libcontainer/cgroups/fs +github.com/opencontainers/runc/libcontainer/cgroups/fscommon github.com/opencontainers/runc/libcontainer/configs github.com/opencontainers/runc/libcontainer/devices github.com/opencontainers/runc/libcontainer/user