Skip to content
This repository has been archived by the owner on May 12, 2021. It is now read-only.

Wait explicitly for VFIO devices to complete hotplug, instead of relying on PCI rescan #850

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ var logsVSockPort = uint32(0)
var debugConsoleVSockPort = uint32(0)

// Timeout waiting for a device to be hotplugged
var hotplugTimeout = 3 * time.Second
var hotplugTimeout = 10 * time.Second
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some container engines like docker and podman kill the container if its status has not changed to started after 10s - so the kata container will be killed before timing out - did you consider this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, in that case the higher level engine could time out before us, but that's kind of unavoidable. In practice the total delay with SHPC is usualy ~6s, so we'd probably complete in time. The 10s here is just on the basis that the timeout should generally be substantially longer than the actual expected time, to avoid false positives.


// Specify the log level
var logLevel = defaultLogLevel
Expand Down Expand Up @@ -761,8 +761,7 @@ func (s *sandbox) listenToUdevEvents() {
fieldLogger.Infof("Received add uevent")

// Check if device hotplug event results in a device node being created.
if uEv.DevName != "" &&
(strings.HasPrefix(uEv.DevPath, rootBusPath) || strings.HasPrefix(uEv.DevPath, acpiDevPath)) {
if strings.HasPrefix(uEv.DevPath, rootBusPath) || strings.HasPrefix(uEv.DevPath, acpiDevPath) {
// Lock is needed to safely read and modify the sysToDevMap and deviceWatchers.
// This makes sure that watchers do not access the map while it is being updated.
s.Lock()
Expand Down
1 change: 0 additions & 1 deletion agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ const (
testExecID = "testExecID"
testContainerID = "testContainerID"
testFileMode = os.FileMode(0640)
testDirMode = os.FileMode(0750)
)

func createFileWithPerms(file, contents string, perms os.FileMode) error {
Expand Down
70 changes: 50 additions & 20 deletions device.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ const (
driverNvdimmType = "nvdimm"
driverEphemeralType = "ephemeral"
driverLocalType = "local"
vmRootfs = "/"
)

const (
pciBusMode = 0220
//VFIO devices to be bound to the VM's native drivers (this is
//Kata specific behaviour, that requires careful configuration
//to get something usable inside the container)
driverVfioVmType = "vfio-vm"
vmRootfs = "/"
)

var (
pciBusRescanFile = sysfsDir + "/bus/pci/rescan"
systemDevPath = "/dev"
getSCSIDevPath = getSCSIDevPathImpl
getPmemDevPath = getPmemDevPathImpl
Expand Down Expand Up @@ -97,10 +97,7 @@ var deviceHandlerList = map[string]deviceHandler{
driverBlkCCWType: virtioBlkCCWDeviceHandler,
driverSCSIType: virtioSCSIDeviceHandler,
driverNvdimmType: nvdimmDeviceHandler,
}

func rescanPciBus() error {
return ioutil.WriteFile(pciBusRescanFile, []byte{'1'}, pciBusMode)
driverVfioVmType: vfioDeviceHandler,
}

// pciPathToSysfs fetches the sysfs path for a PCI path, relative to
Expand Down Expand Up @@ -146,6 +143,7 @@ func pciPathToSysfsImpl(pciPath PciPath) (string, error) {
}

func getDeviceName(s *sandbox, devID string) (string, error) {
var found bool
var devName string
var notifyChan chan string

Expand All @@ -155,7 +153,7 @@ func getDeviceName(s *sandbox, devID string) (string, error) {
s.Lock()
for key, value := range s.sysToDevMap {
if strings.Contains(key, devID) {
devName = value
devName, found = value, true
fieldLogger.Infof("Device: %s found in device map", devID)
break
}
Expand All @@ -166,13 +164,13 @@ func getDeviceName(s *sandbox, devID string) (string, error) {
// The key of the watchers map is the device we are interested in.
// Note this is done inside the lock, not to miss any events from the
// global udev listener.
if devName == "" {
if !found {
notifyChan = make(chan string, 1)
s.deviceWatchers[devID] = notifyChan
}
s.Unlock()

if devName == "" {
if !found {
fieldLogger.Infof("Waiting on channel for device: %s notification", devID)
select {
case devName = <-notifyChan:
Expand All @@ -196,14 +194,6 @@ func getPCIDeviceNameImpl(s *sandbox, pciPath PciPath) (string, error) {
return "", err
}

fieldLogger := agentLog.WithField("sysfsRelPath", sysfsRelPath)

// Rescan pci bus if we need to wait for a new pci device
if err = rescanPciBus(); err != nil {
fieldLogger.WithError(err).Error("Failed to scan pci bus")
return "", err
}

return getDeviceName(s, sysfsRelPath)
}

Expand Down Expand Up @@ -262,6 +252,46 @@ func nvdimmDeviceHandler(_ context.Context, device pb.Device, spec *pb.Spec, s *
return updateSpecDeviceList(device, spec, devIdx)
}

// device.Options should have one entry for each PCI device in the VFIO group
// Each option should have the form "DDDD:BB:DD.F=NN/MM"
// DDDD:BB:DD.F is the device's PCI address in the host
// NN is the PCI slot of the device's bridge, MM is the PCI slot of the device
func vfioDeviceHandler(ctx context.Context, device pb.Device, spec *pb.Spec, s *sandbox, devIdx devIndex) error {
fieldLogger := agentLog.WithField("host-dev", device.ContainerPath)

for _, opt := range device.Options {
tokens := strings.Split(opt, "=")
if len(tokens) != 2 {
return fmt.Errorf("Malformed VFIO option %q", opt)
}

hostBdf := tokens[0]
guestPCIPath := PciPath{tokens[1]}

sysfsRelPath, err := pciPathToSysfs(guestPCIPath)
if err != nil {
return err
}

// We don't actually care what the /dev node is called
// (and there may not be any), but this will wait for
// a uevent that indicates the device is ready
_, err = getDeviceName(s, sysfsRelPath)
if err != nil {
return err
}

tokens = strings.Split(sysfsRelPath, "/")
guestBdf := tokens[len(tokens)-1]

fieldLogger.WithField("host-bdf", hostBdf).WithField("guest-bdf", guestBdf).Debug("VFIO: PCI device ready")
}

fieldLogger.Debug("VFIO: Group complete")

return nil
}

// updateSpecDeviceList takes a device description provided by the caller,
// trying to find it on the guest. Once this device has been identified, the
// "real" information that can be read from inside the VM is used to update
Expand Down
117 changes: 62 additions & 55 deletions device_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,45 +731,6 @@ func TestUpdateSpecDeviceListCharBlockConflict(t *testing.T) {
assert.Equal(hostMinor, spec.Linux.Resources.Devices[1].Minor)
}

func TestRescanPciBus(t *testing.T) {
skipUnlessRoot(t)

assert := assert.New(t)

err := rescanPciBus()
assert.Nil(err)

}

func TestRescanPciBusSubverted(t *testing.T) {
assert := assert.New(t)

dir, err := ioutil.TempDir("", "")
assert.NoError(err)
defer os.RemoveAll(dir)

rescanDir := filepath.Join(dir, "rescan-dir")

err = os.MkdirAll(rescanDir, testDirMode)
assert.NoError(err)

rescan := filepath.Join(rescanDir, "rescan")

savedFile := pciBusRescanFile
defer func() {
pciBusRescanFile = savedFile
}()

pciBusRescanFile = rescan

err = rescanPciBus()
assert.NoError(err)

os.RemoveAll(rescanDir)
err = rescanPciBus()
assert.Error(err)
}

func TestVirtioMmioBlkDeviceHandler(t *testing.T) {
assert := assert.New(t)

Expand Down Expand Up @@ -834,6 +795,47 @@ func TestNvdimmDeviceHandler(t *testing.T) {
assert.Error(err)
}

func TestVfioVmDeviceHandler(t *testing.T) {
assert := assert.New(t)

hostBdf := "0000:0a:0b.0"
guestPciPath := "1c/1d"
guestBridgeBdf := "0000:00:1c.0"
guestBdf := "0000:0e:1d.0"
pciSysPath := filepath.Join(guestBridgeBdf, guestBdf)
guestSysPath := filepath.Join(sysfsDir, rootBusPath, pciSysPath)

device := pb.Device{
Type: driverVfioVmType,
Options: []string{fmt.Sprintf("%s=%s", hostBdf, guestPciPath)},
}
spec := &pb.Spec{}
devIdx := makeDevIndex(spec)
sb := &sandbox{
sysToDevMap: make(map[string]string),
}

// Pre-populate the sysToDevMap, since the getDeviceName
// mechanics is not what we're testing
sb.sysToDevMap[guestSysPath] = ""

ctx := context.Background()

// Replace getDevicePCIAddress with a dummy, since that's not
// what we're testing here, and it would attempt to poke in
// sysfs
savedFunc := pciPathToSysfs
defer func() {
pciPathToSysfs = savedFunc
}()
pciPathToSysfs = func(pciPath PciPath) (string, error) {
return pciSysPath, nil
}

err := vfioDeviceHandler(ctx, device, spec, sb, devIdx)
assert.Nil(err)
}

func TestGetPCIDeviceName(t *testing.T) {
assert := assert.New(t)

Expand Down Expand Up @@ -865,13 +867,6 @@ func TestGetPCIDeviceName(t *testing.T) {

_, err = getPCIDeviceNameImpl(&sb, PciPath{""})
assert.Error(err)

rescanDir := filepath.Dir(pciBusRescanFile)
err = os.MkdirAll(rescanDir, testDirMode)
assert.NoError(err)

_, err = getPCIDeviceNameImpl(&sb, PciPath{""})
assert.Error(err)
}

func TestGetSCSIDevPath(t *testing.T) {
Expand Down Expand Up @@ -912,26 +907,23 @@ func TestCheckCCWBusFormat(t *testing.T) {
}
}

func TestGetDeviceName(t *testing.T) {
func oneGetDeviceNameTest(t *testing.T, sysName, devName, watchFor string) {
assert := assert.New(t)
devName := "vda"
busID := "0.0.0005"
devPath := path.Join("/devices/css0/0.0.0004", busID, "virtio4/block", devName)

systodevmap := make(map[string]string)
systodevmap[devPath] = devName
systodevmap[sysName] = devName

sb := sandbox{
deviceWatchers: make(map[string](chan string)),
sysToDevMap: systodevmap,
}

name, err := getDeviceName(&sb, busID)
name, err := getDeviceName(&sb, watchFor)

assert.Nil(err)
assert.Equal(name, path.Join(devRootPath, devName))

delete(sb.sysToDevMap, devPath)
delete(sb.sysToDevMap, sysName)

go func() {
for {
Expand All @@ -941,7 +933,7 @@ func TestGetDeviceName(t *testing.T) {
continue
}

if strings.Contains(devPath, devAddress) && strings.HasSuffix(devAddress, blkCCWSuffix) {
if strings.Contains(watchFor, devAddress) {
ch <- devName
close(ch)
delete(sb.deviceWatchers, devAddress)
Expand All @@ -954,12 +946,27 @@ func TestGetDeviceName(t *testing.T) {
sb.Unlock()
}()

name, err = getDeviceName(&sb, path.Join(busID, blkCCWSuffix))
name, err = getDeviceName(&sb, watchFor)

assert.Nil(err)
assert.Equal(name, path.Join(devRootPath, devName))
}

func TestGetDeviceName(t *testing.T) {
devName := "vda"
busID := "0.0.0005"
sysName := path.Join("/devices/css0/0.0.0004", busID, "virtio4/block", devName)

oneGetDeviceNameTest(t, sysName, devName, busID)
}

func TestGetDeviceNameEmptyDev(t *testing.T) {
busID := "0000:01:00.0"
sysName := path.Join("/devices/pci0000:00/0000:00:1c.0", busID)

oneGetDeviceNameTest(t, sysName, "", busID)
}

func TestUpdateDeviceCgroupForGuestRootfs(t *testing.T) {
skipUnlessRoot(t)
assert := assert.New(t)
Expand Down
6 changes: 0 additions & 6 deletions grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,12 +618,6 @@ func (a *agentGRPC) CreateContainer(ctx context.Context, req *pb.CreateContainer
return emptyResp, err
}

// re-scan PCI bus
// looking for hidden devices
if err = rescanPciBus(); err != nil {
agentLog.WithError(err).Warn("Could not rescan PCI bus")
}

// Some devices need some extra processing (the ones invoked with
// --device for instance), and that's what this call is doing. It
// updates the devices listed in the OCI spec, so that they actually
Expand Down
3 changes: 2 additions & 1 deletion mount_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,8 @@ func TestStorageHandlers(t *testing.T) {
}

sb := sandbox{
storages: make(map[string]*sandboxStorage),
storages: make(map[string]*sandboxStorage),
deviceWatchers: make(map[string](chan string)),
}

ctx, cancel := context.WithCancel(context.Background())
Expand Down