From 087103805ad47f0d9aefadf33027845f8484c779 Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Tue, 29 Oct 2019 16:45:40 -0700
Subject: [PATCH] sandbox/cgroups: don't constrain if using SandboxCgroupOnly

When SandboxCgroupOnly is set, we are expected to simply inherit the
parent cgroup's settings and to move all Kata threads into that sandbox
cgroup. The initial implementation still adjusted the size of this
cgroup; this commit fixes that.

This commit makes a couple of functional changes, small refactors, and
adds clarifying comments for some functions.

Fixes: #2090

Signed-off-by: Eric Ernst
---
 virtcontainers/cgroups.go |  4 +-
 virtcontainers/sandbox.go | 85 ++++++++++++++++++++++++---------
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/virtcontainers/cgroups.go b/virtcontainers/cgroups.go
index ddb0adcdc7..588613ae0a 100644
--- a/virtcontainers/cgroups.go
+++ b/virtcontainers/cgroups.go
@@ -36,7 +36,7 @@ const cgroupKataPrefix = "kata"
 var cgroupsLoadFunc = cgroups.Load
 var cgroupsNewFunc = cgroups.New
 
-// V1Constraints returns the cgroups that are compatible with th VC architecture
+// V1Constraints returns the cgroups that are compatible with the VC architecture
 // and hypervisor, constraints can be applied to these cgroups.
 func V1Constraints() ([]cgroups.Subsystem, error) {
 	root, err := cgroupV1MountPoint()
@@ -51,7 +51,7 @@ func V1Constraints() ([]cgroups.Subsystem, error) {
 	return cgroupsSubsystems(subsystems)
 }
 
-// V1NoConstraints returns the cgroups that are *not* compatible with th VC
+// V1NoConstraints returns the cgroups that are *not* compatible with the VC
 // architecture and hypervisor, constraints MUST NOT be applied to these cgroups.
 func V1NoConstraints() ([]cgroups.Subsystem, error) {
 	root, err := cgroupV1MountPoint()
diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go
index 6458e69812..8fbafe1f90 100644
--- a/virtcontainers/sandbox.go
+++ b/virtcontainers/sandbox.go
@@ -1806,10 +1806,12 @@ func (s *Sandbox) AddDevice(info config.DeviceInfo) (api.Device, error) {
 	return b, nil
 }
 
+// updateResources will calculate the resources required for the virtual machine, and
+// adjust the virtual machine sizing accordingly. For a given sandbox, it will calculate the
+// number of vCPUs required based on the sum of container requests, plus default CPUs for the VM.
+// The same is done for memory. If changes in memory or CPU are made, the VM will be updated and
+// the agent will online the applicable CPU and memory.
 func (s *Sandbox) updateResources() error {
-	// the hypervisor.MemorySize is the amount of memory reserved for
-	// the VM and contaniners without memory limit
-
 	if s == nil {
 		return errors.New("sandbox is nil")
 	}
@@ -1822,8 +1824,9 @@
 	// Add default vcpus for sandbox
 	sandboxVCPUs += s.hypervisor.hypervisorConfig().NumVCPUs
 
-	sandboxMemoryByte := int64(s.hypervisor.hypervisorConfig().MemorySize) << utils.MibToBytesShift
-	sandboxMemoryByte += s.calculateSandboxMemory()
+	sandboxMemoryByte := s.calculateSandboxMemory()
+	// Add default / reserved memory for the sandbox.
+	sandboxMemoryByte += int64(s.hypervisor.hypervisorConfig().MemorySize) << utils.MibToBytesShift
 
 	// Update VCPUs
 	s.Logger().WithField("cpus-sandbox", sandboxVCPUs).Debugf("Request to hypervisor to update vCPUs")
@@ -1831,7 +1834,8 @@
 	if err != nil {
 		return err
 	}
-	// The CPUs were increased, ask agent to online them
+
+	// If the CPUs were increased, ask agent to online them
 	if oldCPUs < newCPUs {
 		vcpusAdded := newCPUs - oldCPUs
 		if err := s.agent.onlineCPUMem(vcpusAdded, true); err != nil {
@@ -1848,7 +1852,7 @@
 	}
 	s.Logger().Debugf("Sandbox memory size: %d MB", newMemory)
 	if s.state.GuestMemoryHotplugProbe && updatedMemoryDevice.addr != 0 {
-		//notify the guest kernel about memory hot-add event, before onlining them
+		// notify the guest kernel about the memory hot-add event, before onlining it
 		s.Logger().Debugf("notify guest kernel memory hot-add event via probe interface, memory device located at 0x%x", updatedMemoryDevice.addr)
 		if err := s.agent.memHotplugByProbe(updatedMemoryDevice.addr, uint32(updatedMemoryDevice.sizeMB), s.state.GuestMemoryBlockSizeMB); err != nil {
 			return err
@@ -1890,6 +1894,10 @@ func (s *Sandbox) GetHypervisorType() string {
 	return string(s.config.HypervisorType)
 }
 
+// cgroupsUpdate will:
+// 1) get the V1Constraints cgroup associated with the stored cgroup path
+// 2) (re-)add hypervisor vCPU threads to the appropriate cgroup
+// 3) if we are managing the sandbox cgroup, update the V1Constraints cgroup size
 func (s *Sandbox) cgroupsUpdate() error {
 	if s.state.CgroupPath == "" {
 		s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty")
@@ -1901,7 +1909,7 @@
 		return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err)
 	}
 
-	if err := s.constrainHypervisorVCPUs(cgroup); err != nil {
+	if err := s.constrainHypervisor(cgroup); err != nil {
 		return err
 	}
 
@@ -1910,18 +1918,24 @@
 		return nil
 	}
 
-	resources, err := s.resources()
-	if err != nil {
-		return err
-	}
+	// If Kata is configured for SandboxCgroupOnly, do not change the constraints on the cgroup.
+	// We will rely on the values set within the parent cgroup.
+	if !s.config.SandboxCgroupOnly {
+		resources, err := s.resources()
+		if err != nil {
+			return err
+		}
 
-	if err := cgroup.Update(&resources); err != nil {
-		return fmt.Errorf("Could not update sandbox cgroup path='%v' error='%v'", s.state.CgroupPath, err)
+		if err := cgroup.Update(&resources); err != nil {
+			return fmt.Errorf("Could not update sandbox cgroup path='%v' error='%v'", s.state.CgroupPath, err)
+		}
 	}
 
 	return nil
 }
 
+// cgroupsDelete will move the running processes in the sandbox cgroup
+// to the parent and then delete the sandbox cgroup.
 func (s *Sandbox) cgroupsDelete() error {
 	s.Logger().Debug("Deleting sandbox cgroup")
 	if s.state.CgroupPath == "" {
@@ -1930,19 +1944,19 @@
 	}
 
 	var path string
-	cgroupSubystems := V1NoConstraints
+	var cgroupSubsystems cgroups.Hierarchy
 
 	if s.config.SandboxCgroupOnly {
-		// Override V1NoConstraints, if SandboxCgroupOnly is enabled
-		cgroupSubystems = cgroups.V1
+		cgroupSubsystems = cgroups.V1
 		path = s.state.CgroupPath
 		s.Logger().WithField("path", path).Debug("Deleting sandbox cgroups (all subsystems)")
 	} else {
+		cgroupSubsystems = V1NoConstraints
 		path = cgroupNoConstraintsPath(s.state.CgroupPath)
 		s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup")
 	}
 
-	sandboxCgroups, err := cgroupsLoadFunc(cgroupSubystems, cgroups.StaticPath(path))
+	sandboxCgroups, err := cgroupsLoadFunc(cgroupSubsystems, cgroups.StaticPath(path))
 	if err == cgroups.ErrCgroupDeleted {
 		// cgroup already deleted
 		s.Logger().Warnf("cgroup already deleted: '%s'", err)
@@ -1954,7 +1968,7 @@
 	}
 
 	// move running process here, that way cgroup can be removed
-	parent, err := parentCgroup(cgroupSubystems, path)
+	parent, err := parentCgroup(cgroupSubsystems, path)
 	if err != nil {
 		// parent cgroup doesn't exist, that means there are no process running
 		// and the no constraints cgroup was removed.
@@ -1970,35 +1984,42 @@
 	return sandboxCgroups.Delete()
 }
 
-func (s *Sandbox) constrainHypervisorVCPUs(cgroup cgroups.Cgroup) error {
+// constrainHypervisor will place the VMM and vCPU threads into cgroups.
+func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error {
 	pids := s.hypervisor.getPids()
 	if len(pids) == 0 || pids[0] == 0 {
 		return fmt.Errorf("Invalid hypervisor PID: %+v", pids)
 	}
 
-	// Move hypervisor into cgroups without constraints,
-	// those cgroups are not yet supported.
-	resources := &specs.LinuxResources{}
-	path := cgroupNoConstraintsPath(s.state.CgroupPath)
-	noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources)
-	if err != nil {
-		return fmt.Errorf("Could not create cgroup %v: %v", path, err)
-
+	// VMM threads are only placed into the constrained cgroup if SandboxCgroupOnly is set.
+	// This is the "correct" behavior, but if the parent cgroup isn't set up correctly to take
+	// Kata/VMM into account, Kata may fail to boot due to being overconstrained.
+	// If !SandboxCgroupOnly, place the VMM into an unconstrained cgroup.
+	vmmCgroup := cgroup
+	if !s.config.SandboxCgroupOnly {
+		// Move the VMM into cgroups without constraints; those cgroups are not yet supported.
+		resources := &specs.LinuxResources{}
+		path := cgroupNoConstraintsPath(s.state.CgroupPath)
+		var err error
+		vmmCgroup, err = cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources)
+		if err != nil {
+			return fmt.Errorf("Could not create cgroup %v: %v", path, err)
+		}
 	}
+
 	for _, pid := range pids {
 		if pid <= 0 {
 			s.Logger().Warnf("Invalid hypervisor pid: %d", pid)
 			continue
 		}
 
-		if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
-			return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err)
-
+		if err := vmmCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
+			return fmt.Errorf("Could not add hypervisor PID %d to cgroup: %v", pid, err)
 		}
 	}
 
 	// when new container joins, new CPU could be hotplugged, so we
-	// have to query fresh vcpu info from hypervisor for every time.
+	// have to query fresh vcpu info from hypervisor every time.
 	tids, err := s.hypervisor.getThreadIDs()
 	if err != nil {
 		return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
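To illustrate the updateResources change above, here is a minimal, standalone Go sketch (not part of the patch) of the sizing arithmetic: per-container requests are summed first, then the hypervisor's default vCPUs and default/reserved memory are added on top. The containerResources and hypervisorDefaults types are hypothetical stand-ins for the virtcontainers structs, and mibToBytesShift mirrors utils.MibToBytesShift; only the arithmetic is intended to match the patch.

package main

import "fmt"

// mibToBytesShift mirrors utils.MibToBytesShift: 1 MiB == 1 << 20 bytes.
const mibToBytesShift = 20

// containerResources is a hypothetical stand-in for per-container requests.
type containerResources struct {
	vCPUs       uint32
	memoryBytes int64
}

// hypervisorDefaults is a hypothetical stand-in for hypervisorConfig().
type hypervisorDefaults struct {
	numVCPUs     uint32 // default vCPUs always given to the VM
	memorySizeMB int64  // default/reserved VM memory, in MiB
}

// sandboxSizing sums container requests, then adds the VM defaults on top,
// in the same order as the patched updateResources.
func sandboxSizing(ctrs []containerResources, hd hypervisorDefaults) (uint32, int64) {
	var vcpus uint32
	var memBytes int64
	for _, c := range ctrs {
		vcpus += c.vCPUs
		memBytes += c.memoryBytes
	}
	vcpus += hd.numVCPUs
	memBytes += hd.memorySizeMB << mibToBytesShift
	return vcpus, memBytes
}

func main() {
	ctrs := []containerResources{
		{vCPUs: 2, memoryBytes: 512 << mibToBytesShift},
		{vCPUs: 1, memoryBytes: 256 << mibToBytesShift},
	}
	vcpus, mem := sandboxSizing(ctrs, hypervisorDefaults{numVCPUs: 1, memorySizeMB: 2048})
	fmt.Printf("sandbox vCPUs: %d, sandbox memory: %d MiB\n", vcpus, mem>>mibToBytesShift)
	// Output: sandbox vCPUs: 4, sandbox memory: 2816 MiB
}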
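Along the same lines, a sketch of the SandboxCgroupOnly branching now shared by cgroupsUpdate and constrainHypervisor: threads are always (re-)added to the sandbox cgroup, but the cgroup's constraints are only updated when Kata manages the cgroup itself. The cgroup interface and nopCgroup type below are hypothetical stand-ins for the small subset of github.com/containerd/cgroups the patch touches.

package main

import "fmt"

// linuxResources is a stand-in for specs.LinuxResources.
type linuxResources struct{}

// cgroup is a hypothetical stand-in for the subset of the
// github.com/containerd/cgroups Cgroup interface used here.
type cgroup interface {
	Add(pid int) error
	Update(res *linuxResources) error
}

// sandbox is a hypothetical stand-in for the Sandbox config/state.
type sandbox struct {
	sandboxCgroupOnly bool
	cgroupPath        string
}

// cgroupsUpdate mirrors the patched control flow: hypervisor/vCPU threads are
// always (re-)added to the sandbox cgroup, but the cgroup's constraints are
// only resized when Kata manages the cgroup itself (!SandboxCgroupOnly).
func (s *sandbox) cgroupsUpdate(cg cgroup, threadIDs []int) error {
	for _, tid := range threadIDs {
		if err := cg.Add(tid); err != nil {
			return fmt.Errorf("could not add thread %d to cgroup %v: %v", tid, s.cgroupPath, err)
		}
	}

	if s.sandboxCgroupOnly {
		// Inherit the parent cgroup's settings; do not resize this cgroup.
		return nil
	}
	return cg.Update(&linuxResources{})
}

// nopCgroup is a trivial implementation so the sketch runs end to end.
type nopCgroup struct{}

func (nopCgroup) Add(pid int) error            { fmt.Println("added tid", pid); return nil }
func (nopCgroup) Update(*linuxResources) error { fmt.Println("updated constraints"); return nil }

func main() {
	s := &sandbox{sandboxCgroupOnly: true, cgroupPath: "/kata_sbx"}
	_ = s.cgroupsUpdate(nopCgroup{}, []int{101, 102})
	// With SandboxCgroupOnly set, only the Add calls run; Update is skipped.
}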
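Finally, a sketch of the hierarchy/path selection that the cgroupsDelete refactor makes explicit: with SandboxCgroupOnly the sandbox cgroup itself is deleted across all v1 subsystems, otherwise only the no-constraints companion cgroup is. The cgroupNoConstraintsPath helper below is an assumption about the real virtcontainers helper of the same name (the exact prefixing may differ), and the hierarchies are represented as strings purely for illustration.

package main

import (
	"fmt"
	"path/filepath"
)

// cgroupKataPrefix matches the constant in virtcontainers/cgroups.go.
const cgroupKataPrefix = "kata"

// cgroupNoConstraintsPath is an assumed stand-in for the virtcontainers
// helper of the same name; the exact prefixing may differ upstream.
func cgroupNoConstraintsPath(path string) string {
	return filepath.Join("/", cgroupKataPrefix, path)
}

// selectCgroupForDelete mirrors the selection logic in the patched
// cgroupsDelete: with SandboxCgroupOnly, delete the sandbox cgroup itself
// across all v1 subsystems; otherwise delete only the no-constraints cgroup.
func selectCgroupForDelete(sandboxCgroupOnly bool, statePath string) (hierarchy, path string) {
	if sandboxCgroupOnly {
		return "cgroups.V1", statePath // all subsystems
	}
	return "V1NoConstraints", cgroupNoConstraintsPath(statePath)
}

func main() {
	for _, only := range []bool{true, false} {
		h, p := selectCgroupForDelete(only, "/kata_sandbox123")
		fmt.Printf("SandboxCgroupOnly=%v -> hierarchy=%s path=%s\n", only, h, p)
	}
	// SandboxCgroupOnly=true -> hierarchy=cgroups.V1 path=/kata_sandbox123
	// SandboxCgroupOnly=false -> hierarchy=V1NoConstraints path=/kata/kata_sandbox123
}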