Commit 3308077
sandbox/cgroups: don't constrain if using SandboxCgroupOnly
When SandboxCgroupOnly is set, we are expected to simply inherit our parent's
cgroup settings and to move all Kata threads into that sandbox cgroup. The
initial implementation still adjusted the size of this cgroup; this commit
fixes that.

This commit makes a couple of functional changes, small refactors, and
adds clarifying comments for some functions.
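At its core, the fix is an early-return guard at the top of the cgroup-constraining paths, as the virtcontainers/sandbox.go diff below shows. A condensed sketch of the resulting control flow (only the guard is verbatim; the rest is abbreviated):

    func (s *Sandbox) cgroupsUpdate() error {
        // SandboxCgroupOnly: the sandbox cgroup was inherited from the parent
        // and all Kata threads already live in it, so leave its size alone.
        if s.config.SandboxCgroupOnly {
            return nil
        }
        // Otherwise, as before: load the v1 constraints cgroup, (re-)add
        // hypervisor threads to it, and update its size.
        // ...
        return nil
    }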

Fixes: kata-containers#2090

Signed-off-by: Eric Ernst <eric.ernst@intel.com>
Eric Ernst committed Oct 31, 2019
1 parent ee527d3 commit 3308077
Showing 3 changed files with 61 additions and 27 deletions.
virtcontainers/cgroups.go (2 additions & 2 deletions)
@@ -36,7 +36,7 @@ const cgroupKataPrefix = "kata"
 var cgroupsLoadFunc = cgroups.Load
 var cgroupsNewFunc = cgroups.New
 
-// V1Constraints returns the cgroups that are compatible with th VC architecture
+// V1Constraints returns the cgroups that are compatible with the VC architecture
 // and hypervisor, constraints can be applied to these cgroups.
 func V1Constraints() ([]cgroups.Subsystem, error) {
 	root, err := cgroupV1MountPoint()
@@ -51,7 +51,7 @@ func V1Constraints() ([]cgroups.Subsystem, error) {
 	return cgroupsSubsystems(subsystems)
 }
 
-// V1NoConstraints returns the cgroups that are *not* compatible with th VC
+// V1NoConstraints returns the cgroups that are *not* compatible with the VC
 // architecture and hypervisor, constraints MUST NOT be applied to these cgroups.
 func V1NoConstraints() ([]cgroups.Subsystem, error) {
 	root, err := cgroupV1MountPoint()
virtcontainers/cgroups_test.go (1 addition & 0 deletions)
@@ -130,6 +130,7 @@ func TestUpdateCgroups(t *testing.T) {
 		state: types.SandboxState{
 			CgroupPath: "",
 		},
+		config: &SandboxConfig{SandboxCgroupOnly: false},
 	}
 
 	// empty path
virtcontainers/sandbox.go (58 additions & 25 deletions)
@@ -1809,10 +1809,12 @@ func (s *Sandbox) AddDevice(info config.DeviceInfo) (api.Device, error) {
 	return b, nil
 }
 
+// updateResources will calculate the resources required for the virtual machine, and
+// adjust the virtual machine sizing accordingly. For a given sandbox, it will calculate the
+// number of vCPUs required based on the sum of container requests, plus default CPUs for the VM.
+// Similar is done for memory. If changes in memory or CPU are made, the VM will be updated and
+// the agent will online the applicable CPU and memory.
 func (s *Sandbox) updateResources() error {
-	// the hypervisor.MemorySize is the amount of memory reserved for
-	// the VM and contaniners without memory limit
-
 	if s == nil {
 		return errors.New("sandbox is nil")
 	}
@@ -1825,16 +1827,18 @@ func (s *Sandbox) updateResources() error {
 	// Add default vcpus for sandbox
 	sandboxVCPUs += s.hypervisor.hypervisorConfig().NumVCPUs
 
-	sandboxMemoryByte := int64(s.hypervisor.hypervisorConfig().MemorySize) << utils.MibToBytesShift
-	sandboxMemoryByte += s.calculateSandboxMemory()
+	sandboxMemoryByte := s.calculateSandboxMemory()
+	// Add default / rsvd memory for sandbox.
+	sandboxMemoryByte += int64(s.hypervisor.hypervisorConfig().MemorySize) << utils.MibToBytesShift
 
 	// Update VCPUs
 	s.Logger().WithField("cpus-sandbox", sandboxVCPUs).Debugf("Request to hypervisor to update vCPUs")
 	oldCPUs, newCPUs, err := s.hypervisor.resizeVCPUs(sandboxVCPUs)
 	if err != nil {
 		return err
 	}
-	// The CPUs were increased, ask agent to online them
+
+	// If the CPUs were increased, ask agent to online them
 	if oldCPUs < newCPUs {
 		vcpusAdded := newCPUs - oldCPUs
 		if err := s.agent.onlineCPUMem(vcpusAdded, true); err != nil {
@@ -1851,7 +1855,7 @@ func (s *Sandbox) updateResources() error {
 	}
 	s.Logger().Debugf("Sandbox memory size: %d MB", newMemory)
 	if s.state.GuestMemoryHotplugProbe && updatedMemoryDevice.addr != 0 {
-		//notify the guest kernel about memory hot-add event, before onlining them
+		// notify the guest kernel about memory hot-add event, before onlining them
 		s.Logger().Debugf("notify guest kernel memory hot-add event via probe interface, memory device located at 0x%x", updatedMemoryDevice.addr)
 		if err := s.agent.memHotplugByProbe(updatedMemoryDevice.addr, uint32(updatedMemoryDevice.sizeMB), s.state.GuestMemoryBlockSizeMB); err != nil {
 			return err
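To make the sizing arithmetic concrete, here is a small, self-contained sketch of the memory calculation in this hunk. The shift value of 20 (MiB to bytes) mirrors utils.MibToBytesShift; the container limits and the 2048 MiB default are invented for illustration:

    package main

    import "fmt"

    const mibToBytesShift = 20 // mirrors utils.MibToBytesShift

    func main() {
        // Hypothetical sandbox: two containers with 512 MiB and 1024 MiB
        // memory limits, plus a default VM memory of 2048 MiB.
        containerLimitsByte := []int64{512 << mibToBytesShift, 1024 << mibToBytesShift}

        var sandboxMemoryByte int64
        for _, l := range containerLimitsByte {
            sandboxMemoryByte += l // calculateSandboxMemory() sums these
        }
        // Add default / reserved memory for the sandbox.
        sandboxMemoryByte += int64(2048) << mibToBytesShift

        // Prints 3758096384 bytes (3584 MiB): the size requested from the hypervisor.
        fmt.Printf("%d bytes (%d MiB)\n", sandboxMemoryByte, sandboxMemoryByte>>mibToBytesShift)
    }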
@@ -1893,7 +1897,19 @@ func (s *Sandbox) GetHypervisorType() string {
 	return string(s.config.HypervisorType)
 }
 
+// cgroupsUpdate will:
+// 1) get the v1constraints cgroup associated with the stored cgroup path
+// 2) (re-)add hypervisor vCPU threads to the appropriate cgroup
+// 3) If we are managing sandbox cgroup, update the v1constraints cgroup size
 func (s *Sandbox) cgroupsUpdate() error {
+
+	// If Kata is configured for SandboxCgroupOnly, the VMM and its processes are already
+	// in the Kata sandbox cgroup (inherited). No need to move threads/processes, and we should
+	// rely on parent's cgroup CPU/memory values
+	if s.config.SandboxCgroupOnly {
+		return nil
+	}
+
 	if s.state.CgroupPath == "" {
 		s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty")
 		return nil
@@ -1904,7 +1920,7 @@ func (s *Sandbox) cgroupsUpdate() error {
 		return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err)
 	}
 
-	if err := s.constrainHypervisorVCPUs(cgroup); err != nil {
+	if err := s.constrainHypervisor(cgroup); err != nil {
 		return err
 	}
 
@@ -1925,6 +1941,8 @@ func (s *Sandbox) cgroupsUpdate() error {
 	return nil
 }
 
+// cgroupsDelete will move the running processes in the sandbox cgroup
+// to the parent and then delete the sandbox cgroup
 func (s *Sandbox) cgroupsDelete() error {
 	s.Logger().Debug("Deleting sandbox cgroup")
 	if s.state.CgroupPath == "" {
@@ -1933,19 +1951,19 @@ func (s *Sandbox) cgroupsDelete() error {
 	}
 
 	var path string
-	cgroupSubystems := V1NoConstraints
+	var cgroupSubsystems cgroups.Hierarchy
 
 	if s.config.SandboxCgroupOnly {
-		// Override V1NoConstraints, if SandboxCgroupOnly is enabled
-		cgroupSubystems = cgroups.V1
+		cgroupSubsystems = cgroups.V1
 		path = s.state.CgroupPath
 		s.Logger().WithField("path", path).Debug("Deleting sandbox cgroups (all subsystems)")
 	} else {
+		cgroupSubsystems = V1NoConstraints
 		path = cgroupNoConstraintsPath(s.state.CgroupPath)
 		s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup")
 	}
 
-	sandboxCgroups, err := cgroupsLoadFunc(cgroupSubystems, cgroups.StaticPath(path))
+	sandboxCgroups, err := cgroupsLoadFunc(cgroupSubsystems, cgroups.StaticPath(path))
 	if err == cgroups.ErrCgroupDeleted {
 		// cgroup already deleted
 		s.Logger().Warnf("cgroup already deleted: '%s'", err)
@@ -1957,7 +1975,7 @@ func (s *Sandbox) cgroupsDelete() error {
 	}
 
 	// move running process here, that way cgroup can be removed
-	parent, err := parentCgroup(cgroupSubystems, path)
+	parent, err := parentCgroup(cgroupSubsystems, path)
 	if err != nil {
 		// parent cgroup doesn't exist, that means there are no process running
 		// and the no constraints cgroup was removed.
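The teardown pattern here is standard for cgroup v1: a cgroup cannot be removed while it still contains processes, so members are reparented first. A minimal sketch using the containerd/cgroups API that this code wraps (the paths are hypothetical stand-ins for s.state.CgroupPath and its parent):

    import "github.com/containerd/cgroups"

    // deleteSandboxCgroup empties a v1 cgroup into its parent, then removes it.
    func deleteSandboxCgroup() error {
        cg, err := cgroups.Load(cgroups.V1, cgroups.StaticPath("/kata/sandbox-abc"))
        if err == cgroups.ErrCgroupDeleted {
            return nil // already gone, nothing to do
        }
        if err != nil {
            return err
        }
        parent, err := cgroups.Load(cgroups.V1, cgroups.StaticPath("/kata"))
        if err != nil {
            return err
        }
        // MoveTo reparents every member process so Delete can succeed.
        if err := cg.MoveTo(parent); err != nil {
            return err
        }
        return cg.Delete()
    }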
@@ -1973,35 +1991,50 @@ func (s *Sandbox) cgroupsDelete() error {
 	return sandboxCgroups.Delete()
 }
 
-func (s *Sandbox) constrainHypervisorVCPUs(cgroup cgroups.Cgroup) error {
+// constrainHypervisor will place the VMM and vCPU threads into cgroups.
+func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error {
+	// VMM threads are only placed into the constrained cgroup if SandboxCgroupOnly is being set.
+	// This is the "correct" behavior, but if the parent cgroup isn't set up correctly to take
+	// Kata/VMM into account, Kata may fail to boot due to being overconstrained.
+	// If !SandboxCgroupOnly, place the VMM into an unconstrained cgroup, and the vCPU threads into constrained
+	// cgroup
+	if s.config.SandboxCgroupOnly {
+		// Kata components were moved into the sandbox-cgroup already, so VMM
+		// will already land there as well. No need to take action
+		return nil
+	}
+
 	pids := s.hypervisor.getPids()
 	if len(pids) == 0 || pids[0] == 0 {
 		return fmt.Errorf("Invalid hypervisor PID: %+v", pids)
 	}
 
-	// Move hypervisor into cgroups without constraints,
-	// those cgroups are not yet supported.
+	// Move the VMM into cgroups without constraints, those cgroups are not yet supported.
 	resources := &specs.LinuxResources{}
 	path := cgroupNoConstraintsPath(s.state.CgroupPath)
-	noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources)
+	vmmCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources)
 	if err != nil {
 		return fmt.Errorf("Could not create cgroup %v: %v", path, err)
 	}
 
 	for _, pid := range pids {
 		if pid <= 0 {
 			s.Logger().Warnf("Invalid hypervisor pid: %d", pid)
 			continue
 		}
 
-		if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
-			return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err)
+		if err := vmmCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
+			return fmt.Errorf("Could not add hypervisor PID %d to cgroup: %v", pid, err)
 		}
 	}
 
 	// when new container joins, new CPU could be hotplugged, so we
-	// have to query fresh vcpu info from hypervisor for every time.
+	// have to query fresh vcpu info from hypervisor every time.
 	tids, err := s.hypervisor.getThreadIDs()
 	if err != nil {
 		return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
@@ -2013,9 +2046,9 @@ func (s *Sandbox) constrainHypervisorVCPUs(cgroup cgroups.Cgroup) error {
 		return nil
 	}
 
-	// We are about to move just the vcpus (threads) into cgroups with constraints.
-	// Move whole hypervisor process whould be easier but the IO/network performance
-	// whould be impacted.
+	// Move vcpus (threads) into cgroups with constraints.
+	// Move whole hypervisor process would be easier but the IO/network performance
+	// would be over-constrained.
 	for _, i := range tids.vcpus {
 		// In contrast, AddTask will write thread id to `tasks`
 		// After this, vcpu threads are in "vcpu" sub-cgroup, other threads in
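The comment above (truncated in this view) contrasts the two containerd/cgroups entry points: Add writes a process id to cgroup.procs, moving the entire process, while AddTask writes a single thread id to tasks. A sketch of the idea, reusing the cgroup and tids variables from the surrounding function (the CPU quota and period values are invented):

    quota := int64(100000)
    period := uint64(100000)
    // Create (or reuse) a "vcpu" sub-cgroup holding the CPU constraints.
    vcpuCgroup, err := cgroup.New("vcpu", &specs.LinuxResources{
        CPU: &specs.LinuxCPU{Quota: &quota, Period: &period},
    })
    if err != nil {
        return err
    }
    for _, tid := range tids.vcpus {
        // AddTask writes the thread id to `tasks`, so only this vCPU thread is
        // constrained; Add would write the pid to `cgroup.procs` and drag the
        // whole hypervisor process (I/O and network threads included) along.
        if err := vcpuCgroup.AddTask(cgroups.Process{Pid: tid}); err != nil {
            return err
        }
    }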
