From 2c7f27ec4f8faf979e598fa63a5c6f9d5dcd9c6f Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 6 Mar 2020 16:39:45 +0000 Subject: [PATCH 1/8] vendor: update govmm bring `pmem` option to pmem/nvdimm devices shortlog: qemu: add pmem flag to memory-backend-file Signed-off-by: Julio Montes --- Gopkg.lock | 4 ++-- Gopkg.toml | 2 +- vendor/github.com/intel/govmm/qemu/qmp.go | 13 +++++++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index 8c4a4b66f5..db5230b28b 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -412,11 +412,11 @@ revision = "2f1d1f20f75d5404f53b9edf6b53ed5505508675" [[projects]] - digest = "1:0044fb81f517f480ca3c33675a3af6b4ada77a6faf699a302bc2388c98cacba9" + digest = "1:fafdb4aa5b6207f51ec7557818d5f7a534ed44ea4fb31c6f2e8abb01d1627b74" name = "github.com/intel/govmm" packages = ["qemu"] pruneopts = "NUT" - revision = "3700c55dd766d37e17af354fb9975dc801619d62" + revision = "e969afbec52cf687bbe97b76654c664128cdb04b" [[projects]] digest = "1:d6e9b99fe0150d4c26d81612676e8d59ad045642e4cbc8646e494b50d4f245ef" diff --git a/Gopkg.toml b/Gopkg.toml index cb981258c0..84b9e446c7 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -48,7 +48,7 @@ [[constraint]] name = "github.com/intel/govmm" - revision = "3700c55dd766d37e17af354fb9975dc801619d62" + revision = "e969afbec52cf687bbe97b76654c664128cdb04b" [[constraint]] name = "github.com/kata-containers/agent" diff --git a/vendor/github.com/intel/govmm/qemu/qmp.go b/vendor/github.com/intel/govmm/qemu/qmp.go index 83ac94a043..bf9a77dda1 100644 --- a/vendor/github.com/intel/govmm/qemu/qmp.go +++ b/vendor/github.com/intel/govmm/qemu/qmp.go @@ -1428,8 +1428,9 @@ func (q *QMP) ExecHotplugMemory(ctx context.Context, qomtype, id, mempath string // a NVDIMM driver with the device_add command. // id is the id of the device to add. It must be a valid QMP identifier. // mempath is the path of the device to add, e.g., /dev/rdb0. size is -// the data size of the device. 
-func (q *QMP) ExecuteNVDIMMDeviceAdd(ctx context.Context, id, mempath string, size int64) error { +// the data size of the device. pmem is to guarantee the persistence of QEMU writes +// to the vNVDIMM backend. +func (q *QMP) ExecuteNVDIMMDeviceAdd(ctx context.Context, id, mempath string, size int64, pmem *bool) error { args := map[string]interface{}{ "qom-type": "memory-backend-file", "id": "nvdimmbackmem" + id, @@ -1439,6 +1440,14 @@ func (q *QMP) ExecuteNVDIMMDeviceAdd(ctx context.Context, id, mempath string, si "share": true, }, } + + if q.version.Major > 4 || (q.version.Major == 4 && q.version.Minor >= 1) { + if pmem != nil { + props := args["props"].(map[string]interface{}) + props["pmem"] = *pmem + } + } + err := q.executeCommand(ctx, "object-add", args, nil) if err != nil { return err From 0a4e2edcf4464d0c0ba6c501472687537de344bb Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 6 Mar 2020 20:54:21 +0000 Subject: [PATCH 2/8] virtcontainers: move GetDevicePathAndFsType to utils_linux `GetDevicePathAndFsType` is a function to get the path and filesystem type of a mount point from `/proc/mounts`. Move `GetDevicePathAndFsType` to utils_linux since it's linux specific and that way it can be used in other subpackages. 
Signed-off-by: Julio Montes --- virtcontainers/container.go | 2 +- virtcontainers/mount.go | 60 ++---------------------- virtcontainers/mount_test.go | 16 ------- virtcontainers/utils/utils_linux.go | 56 ++++++++++++++++++++++ virtcontainers/utils/utils_linux_test.go | 16 +++++++ 5 files changed, 76 insertions(+), 74 deletions(-) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index 1f5d2c4e5f..bfb32e9a42 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -1323,7 +1323,7 @@ func (c *Container) hotplugDrive() error { c.rootfsSuffix = "" } // If device mapper device, then fetch the full path of the device - devicePath, fsType, err = GetDevicePathAndFsType(dev.mountPoint) + devicePath, fsType, err = utils.GetDevicePathAndFsType(dev.mountPoint) if err != nil { return err } diff --git a/virtcontainers/mount.go b/virtcontainers/mount.go index fc06424c94..dd6177bef7 100644 --- a/virtcontainers/mount.go +++ b/virtcontainers/mount.go @@ -6,17 +6,16 @@ package virtcontainers import ( - "bufio" "context" "errors" "fmt" - "io" "os" "path/filepath" "strings" "syscall" merr "github.com/hashicorp/go-multierror" + "github.com/kata-containers/runtime/virtcontainers/utils" "github.com/sirupsen/logrus" ) @@ -187,59 +186,6 @@ func getDeviceForPath(path string) (device, error) { return dev, nil } -const ( - procMountsFile = "/proc/mounts" - - fieldsPerLine = 6 -) - -const ( - procDeviceIndex = iota - procPathIndex - procTypeIndex -) - -// GetDevicePathAndFsType gets the device for the mount point and the file system type -// of the mount. 
-func GetDevicePathAndFsType(mountPoint string) (devicePath, fsType string, err error) { - if mountPoint == "" { - err = fmt.Errorf("Mount point cannot be empty") - return - } - - var file *os.File - - file, err = os.Open(procMountsFile) - if err != nil { - return - } - - defer file.Close() - - reader := bufio.NewReader(file) - for { - var line string - - line, err = reader.ReadString('\n') - if err == io.EOF { - err = fmt.Errorf("Mount %s not found", mountPoint) - return - } - - fields := strings.Fields(line) - if len(fields) != fieldsPerLine { - err = fmt.Errorf("Incorrect no of fields (expected %d, got %d)) :%s", fieldsPerLine, len(fields), line) - return - } - - if mountPoint == fields[procPathIndex] { - devicePath = fields[procDeviceIndex] - fsType = fields[procTypeIndex] - return - } - } -} - var blockFormatTemplate = "/sys/dev/block/%d:%d/dm" var checkStorageDriver = isDeviceMapper @@ -445,7 +391,7 @@ func IsEphemeralStorage(path string) bool { return false } - if _, fsType, _ := GetDevicePathAndFsType(path); fsType == "tmpfs" { + if _, fsType, _ := utils.GetDevicePathAndFsType(path); fsType == "tmpfs" { return true } @@ -460,7 +406,7 @@ func Isk8sHostEmptyDir(path string) bool { return false } - if _, fsType, _ := GetDevicePathAndFsType(path); fsType != "tmpfs" { + if _, fsType, _ := utils.GetDevicePathAndFsType(path); fsType != "tmpfs" { return true } return false diff --git a/virtcontainers/mount_test.go b/virtcontainers/mount_test.go index fc394cc558..5a39133be7 100644 --- a/virtcontainers/mount_test.go +++ b/virtcontainers/mount_test.go @@ -206,22 +206,6 @@ func TestGetDeviceForPathBindMount(t *testing.T) { assert.Equal(sourceDev, destDev) } -func TestGetDevicePathAndFsTypeEmptyMount(t *testing.T) { - assert := assert.New(t) - _, _, err := GetDevicePathAndFsType("") - assert.Error(err) -} - -func TestGetDevicePathAndFsTypeSuccessful(t *testing.T) { - assert := assert.New(t) - - path, fstype, err := GetDevicePathAndFsType("/proc") - assert.NoError(err) - 
- assert.Equal(path, "proc") - assert.Equal(fstype, "proc") -} - func TestIsDeviceMapper(t *testing.T) { assert := assert.New(t) diff --git a/virtcontainers/utils/utils_linux.go b/virtcontainers/utils/utils_linux.go index 49d89dbd71..ad870d63ee 100644 --- a/virtcontainers/utils/utils_linux.go +++ b/virtcontainers/utils/utils_linux.go @@ -6,10 +6,13 @@ package utils import ( + "bufio" "crypto/rand" "fmt" + "io" "math/big" "os" + "strings" "syscall" "unsafe" @@ -85,3 +88,56 @@ func FindContextID() (*os.File, uint64, error) { vsockFd.Close() return nil, 0, fmt.Errorf("Could not get a unique context ID for the vsock : %s", err) } + +const ( + procMountsFile = "/proc/mounts" + + fieldsPerLine = 6 +) + +const ( + procDeviceIndex = iota + procPathIndex + procTypeIndex +) + +// GetDevicePathAndFsType gets the device for the mount point and the file system type +// of the mount. +func GetDevicePathAndFsType(mountPoint string) (devicePath, fsType string, err error) { + if mountPoint == "" { + err = fmt.Errorf("Mount point cannot be empty") + return + } + + var file *os.File + + file, err = os.Open(procMountsFile) + if err != nil { + return + } + + defer file.Close() + + reader := bufio.NewReader(file) + for { + var line string + + line, err = reader.ReadString('\n') + if err == io.EOF { + err = fmt.Errorf("Mount %s not found", mountPoint) + return + } + + fields := strings.Fields(line) + if len(fields) != fieldsPerLine { + err = fmt.Errorf("Incorrect no of fields (expected %d, got %d)) :%s", fieldsPerLine, len(fields), line) + return + } + + if mountPoint == fields[procPathIndex] { + devicePath = fields[procDeviceIndex] + fsType = fields[procTypeIndex] + return + } + } +} diff --git a/virtcontainers/utils/utils_linux_test.go b/virtcontainers/utils/utils_linux_test.go index 4901348ebc..4554fa935d 100644 --- a/virtcontainers/utils/utils_linux_test.go +++ b/virtcontainers/utils/utils_linux_test.go @@ -33,3 +33,19 @@ func TestFindContextID(t *testing.T) { assert.Zero(cid) 
assert.Error(err) } + +func TestGetDevicePathAndFsTypeEmptyMount(t *testing.T) { + assert := assert.New(t) + _, _, err := GetDevicePathAndFsType("") + assert.Error(err) +} + +func TestGetDevicePathAndFsTypeSuccessful(t *testing.T) { + assert := assert.New(t) + + path, fstype, err := GetDevicePathAndFsType("/proc") + assert.NoError(err) + + assert.Equal(path, "proc") + assert.Equal(fstype, "proc") +} From 9ff44dba870d3b7ae7828639bda81f2f189a6fb6 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 20 Mar 2020 14:03:57 +0000 Subject: [PATCH 3/8] virtcontainers: implement function to get the backing file Implement function to get the backing file from a loop device. The backing file can be used as backend file for a NVDIMM device in the guest Signed-off-by: Julio Montes --- virtcontainers/device/config/config.go | 53 ++++++++++----- virtcontainers/device/config/config_test.go | 73 +++++++++++++++++++++ 2 files changed, 108 insertions(+), 18 deletions(-) create mode 100644 virtcontainers/device/config/config_test.go diff --git a/virtcontainers/device/config/config.go b/virtcontainers/device/config/config.go index 57b4f0a7b9..c054f0df55 100644 --- a/virtcontainers/device/config/config.go +++ b/virtcontainers/device/config/config.go @@ -12,6 +12,7 @@ import ( "os" "path/filepath" "strconv" + "strings" "github.com/go-ini/ini" "golang.org/x/sys/unix" @@ -91,6 +92,8 @@ var SysBusPciDevicesPath = "/sys/bus/pci/devices" // SysBusPciSlotsPath is static string of /sys/bus/pci/slots var SysBusPciSlotsPath = "/sys/bus/pci/slots" +var getSysDevPath = getSysDevPathImpl + // DeviceInfo is an embedded type that contains device data common to all types of devices. 
type DeviceInfo struct { // Hostpath is device path on host @@ -257,29 +260,14 @@ func GetHostPath(devInfo DeviceInfo, vhostUserStoreEnabled bool, vhostUserStoreP return "", fmt.Errorf("Empty path provided for device") } - var pathComp string - - switch devInfo.DevType { - case "c", "u": - pathComp = "char" - case "b": - pathComp = "block" - default: - // Unsupported device types. Return nil error to ignore devices - // that cannot be handled currently. - return "", nil - } - // Filter out vhost-user storage devices by device Major numbers. if vhostUserStoreEnabled && devInfo.DevType == "b" && (devInfo.Major == VhostUserSCSIMajor || devInfo.Major == VhostUserBlkMajor) { return getVhostUserHostPath(devInfo, vhostUserStorePath) } - format := strconv.FormatInt(devInfo.Major, 10) + ":" + strconv.FormatInt(devInfo.Minor, 10) - sysDevPath := filepath.Join(SysDevPrefix, pathComp, format, "uevent") - - if _, err := os.Stat(sysDevPath); err != nil { + ueventPath := filepath.Join(getSysDevPath(devInfo), "uevent") + if _, err := os.Stat(ueventPath); err != nil { // Some devices(eg. /dev/fuse, /dev/cuse) do not always implement sysfs interface under /sys/dev // These devices are passed by default by docker. // @@ -293,7 +281,7 @@ func GetHostPath(devInfo DeviceInfo, vhostUserStoreEnabled bool, vhostUserStoreP return "", err } - content, err := ini.Load(sysDevPath) + content, err := ini.Load(ueventPath) if err != nil { return "", err } @@ -306,6 +294,35 @@ func GetHostPath(devInfo DeviceInfo, vhostUserStoreEnabled bool, vhostUserStoreP return filepath.Join("/dev", devName.String()), nil } +// getBackingFile is used to fetch the backing file for the device. 
+func getBackingFile(devInfo DeviceInfo) (string, error) { + backingFilePath := filepath.Join(getSysDevPath(devInfo), "loop", "backing_file") + data, err := ioutil.ReadFile(backingFilePath) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(data)), nil +} + +func getSysDevPathImpl(devInfo DeviceInfo) string { + var pathComp string + + switch devInfo.DevType { + case "c", "u": + pathComp = "char" + case "b": + pathComp = "block" + default: + // Unsupported device types. Return nil error to ignore devices + // that cannot be handled currently. + return "" + } + + format := strconv.FormatInt(devInfo.Major, 10) + ":" + strconv.FormatInt(devInfo.Minor, 10) + return filepath.Join(SysDevPrefix, pathComp, format) +} + // getVhostUserHostPath is used to fetch host path for the vhost-user device. // For vhost-user block device like vhost-user-blk or vhost-user-scsi, its // socket should be under directory "/block/sockets/"; diff --git a/virtcontainers/device/config/config_test.go b/virtcontainers/device/config/config_test.go new file mode 100644 index 0000000000..698f52ecd9 --- /dev/null +++ b/virtcontainers/device/config/config_test.go @@ -0,0 +1,73 @@ +// Copyright (c) 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package config + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGetBackingFile(t *testing.T) { + assert := assert.New(t) + + dir, err := ioutil.TempDir("", "backing") + assert.NoError(err) + defer os.RemoveAll(dir) + + orgGetSysDevPath := getSysDevPath + getSysDevPath = func(info DeviceInfo) string { + return dir + } + defer func() { getSysDevPath = orgGetSysDevPath }() + + info := DeviceInfo{} + path, err := getBackingFile(info) + assert.Error(err) + assert.Empty(path) + + loopDir := filepath.Join(dir, "loop") + err = os.Mkdir(loopDir, os.FileMode(0755)) + assert.NoError(err) + + backingFile := "/fake-img" + + err = 
ioutil.WriteFile(filepath.Join(loopDir, "backing_file"), []byte(backingFile), os.FileMode(0755)) + assert.NoError(err) + + path, err = getBackingFile(info) + assert.NoError(err) + assert.Equal(backingFile, path) +} + +func TestGetSysDevPathImpl(t *testing.T) { + assert := assert.New(t) + + info := DeviceInfo{ + DevType: "", + Major: 127, + Minor: 0, + } + + path := getSysDevPathImpl(info) + assert.Empty(path) + + expectedFormat := fmt.Sprintf("%d:%d", info.Major, info.Minor) + + info.DevType = "c" + path = getSysDevPathImpl(info) + assert.Contains(path, expectedFormat) + assert.Contains(path, "char") + + info.DevType = "b" + path = getSysDevPathImpl(info) + assert.Contains(path, expectedFormat) + assert.Contains(path, "block") +} From ee941e5c56b520396b0619bf08ae5abaaccc6eff Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 28 Feb 2020 21:01:57 +0000 Subject: [PATCH 4/8] virtcontainers: Implement function to get the pmem DeviceInfo Implement function to get the pmem `DeviceInfo` from a volume. `PmemDeviceInfo` returns a new `DeviceInfo` object if a volume has a loop device as backend and the backing file for such loop device contains the PFN signature, needed to enable DAX in the guest. Signed-off-by: Julio Montes --- virtcontainers/device/config/config.go | 8 ++ virtcontainers/device/config/pmem.go | 116 ++++++++++++++++++++++ virtcontainers/device/config/pmem_test.go | 49 +++++++++ 3 files changed, 173 insertions(+) create mode 100644 virtcontainers/device/config/pmem.go create mode 100644 virtcontainers/device/config/pmem_test.go diff --git a/virtcontainers/device/config/config.go b/virtcontainers/device/config/config.go index c054f0df55..18fcfedcec 100644 --- a/virtcontainers/device/config/config.go +++ b/virtcontainers/device/config/config.go @@ -113,6 +113,10 @@ type DeviceInfo struct { Major int64 Minor int64 + // Pmem enabled persistent memory. Use HostPath as backing file + // for a nvdimm device in the guest. 
+ Pmem bool + // FileMode permission bits for the device. FileMode os.FileMode @@ -169,6 +173,10 @@ type BlockDrive struct { // ReadOnly sets the device file readonly ReadOnly bool + + // Pmem enables persistent memory. Use File as backing file + // for a nvdimm device in the guest + Pmem bool } // VFIODeviceType indicates VFIO device type diff --git a/virtcontainers/device/config/pmem.go b/virtcontainers/device/config/pmem.go new file mode 100644 index 0000000000..9cfe3b58f4 --- /dev/null +++ b/virtcontainers/device/config/pmem.go @@ -0,0 +1,116 @@ +// Copyright (c) 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package config + +import ( + "fmt" + "os" + "syscall" + + "github.com/kata-containers/runtime/virtcontainers/utils" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + // This signature is defined in the linux NVDIMM driver. + // devices or backing files with this signature can be used + // as pmem (persistent memory) devices in the guest. + pfnSignature = "NVDIMM_PFN_INFO" + + // offset in the backing file where the signature should be + pfnSignatureOffset = int64(4 * 1024) +) + +var ( + pmemLog = logrus.WithField("source", "virtcontainers/device/config") +) + +// PmemDeviceInfo returns a DeviceInfo if a loop device +// is mounted on source, and the backing file of the loop device +// has the PFN signature. 
+func PmemDeviceInfo(source, destination string) (*DeviceInfo, error) { + stat := syscall.Stat_t{} + err := syscall.Stat(source, &stat) + if err != nil { + return nil, err + } + + // device object is still incomplete, + // but it can be used to fetch the backing file + device := &DeviceInfo{ + ContainerPath: destination, + DevType: "b", + Major: int64(unix.Major(stat.Dev)), + Minor: int64(unix.Minor(stat.Dev)), + Pmem: true, + DriverOptions: make(map[string]string), + } + + pmemLog.WithFields( + logrus.Fields{ + "major": device.Major, + "minor": device.Minor, + }).Debug("looking for backing file") + + device.HostPath, err = getBackingFile(*device) + if err != nil { + return nil, err + } + + pmemLog.WithField("backing-file", device.HostPath). + Debug("backing file found: looking for PFN signature") + + if !hasPFNSignature(device.HostPath) { + return nil, fmt.Errorf("backing file %v has not PFN signature", device.HostPath) + } + + _, fstype, err := utils.GetDevicePathAndFsType(source) + if err != nil { + pmemLog.WithError(err).WithField("mount-point", source).Warn("failed to get fstype: using ext4") + fstype = "ext4" + } + + pmemLog.WithField("fstype", fstype).Debug("filesystem for mount point") + device.DriverOptions["fstype"] = fstype + + return device, nil +} + +// returns true if the file/device path has the PFN signature +// required to use it as PMEM device and enable DAX. +// See [1] to know more about the PFN signature. 
+// +// [1] - https://github.com/kata-containers/osbuilder/blob/master/image-builder/nsdax.gpl.c +func hasPFNSignature(path string) bool { + f, err := os.Open(path) + if err != nil { + pmemLog.WithError(err).Error("Could not get PFN signature") + return false + } + defer f.Close() + + signatureLen := len(pfnSignature) + signature := make([]byte, signatureLen) + + l, err := f.ReadAt(signature, pfnSignatureOffset) + if err != nil { + pmemLog.WithError(err).Debug("Could not read pfn signature") + return false + } + + pmemLog.WithFields(logrus.Fields{ + "path": path, + "signature": string(signature), + }).Debug("got signature") + + if l != signatureLen { + pmemLog.WithField("read-bytes", l).Debug("Incomplete signature") + return false + } + + return pfnSignature == string(signature) +} diff --git a/virtcontainers/device/config/pmem_test.go b/virtcontainers/device/config/pmem_test.go new file mode 100644 index 0000000000..319adc2536 --- /dev/null +++ b/virtcontainers/device/config/pmem_test.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package config + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" +) + +func createPFNFile(assert *assert.Assertions, dir string) string { + pfnPath := filepath.Join(dir, "pfn") + file, err := os.Create(pfnPath) + assert.NoError(err) + defer file.Close() + + l, err := file.WriteAt([]byte(pfnSignature), pfnSignatureOffset) + assert.NoError(err) + assert.Equal(len(pfnSignature), l) + + return pfnPath +} + +func TestHasPFNSignature(t *testing.T) { + assert := assert.New(t) + + b := hasPFNSignature("/abc/xyz/123/sw") + assert.False(b) + + f, err := ioutil.TempFile("", "pfn") + assert.NoError(err) + f.Close() + defer os.Remove(f.Name()) + + b = hasPFNSignature(f.Name()) + assert.False(b) + + pfnFile := createPFNFile(assert, os.TempDir()) + defer os.Remove(pfnFile) + + b = hasPFNSignature(pfnFile) + assert.True(b) +} From 
abbdf078cd5aaf328f5cccf2dfc33730e92d8dc7 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 28 Feb 2020 21:14:07 +0000 Subject: [PATCH 5/8] virtcontainers: add Pmem attribute to BlockDrive A `BlockDrive` can be used as pmem device, since they both are similar and can be mounted in the same way in the guest. The `Pmem` attribute helps kata to identify a pmem device and how it has to be hotplugged in the guest. Signed-off-by: Julio Montes --- virtcontainers/device/drivers/block.go | 7 +++++++ virtcontainers/persist/api/device.go | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/virtcontainers/device/drivers/block.go b/virtcontainers/device/drivers/block.go index 62d55f6493..6711345571 100644 --- a/virtcontainers/device/drivers/block.go +++ b/virtcontainers/device/drivers/block.go @@ -65,6 +65,11 @@ func (device *BlockDevice) Attach(devReceiver api.DeviceReceiver) (err error) { Format: "raw", ID: utils.MakeNameID("drive", device.DeviceInfo.ID, maxDevIDSize), Index: index, + Pmem: device.DeviceInfo.Pmem, + } + + if fs, ok := device.DeviceInfo.DriverOptions["fstype"]; ok { + drive.Format = fs } customOptions := device.DeviceInfo.DriverOptions @@ -169,6 +174,7 @@ func (device *BlockDevice) Save() persistapi.DeviceState { NvdimmID: drive.NvdimmID, VirtPath: drive.VirtPath, DevNo: drive.DevNo, + Pmem: drive.Pmem, } } return ds @@ -194,6 +200,7 @@ func (device *BlockDevice) Load(ds persistapi.DeviceState) { NvdimmID: bd.NvdimmID, VirtPath: bd.VirtPath, DevNo: bd.DevNo, + Pmem: bd.Pmem, } } diff --git a/virtcontainers/persist/api/device.go b/virtcontainers/persist/api/device.go index 67226542a9..34916ed0b3 100644 --- a/virtcontainers/persist/api/device.go +++ b/virtcontainers/persist/api/device.go @@ -41,6 +41,10 @@ type BlockDrive struct { // DevNo DevNo string + + // Pmem enabled persistent memory. Use File as backing file + // for a nvdimm device in the guest. 
+ Pmem bool } // VFIODev represents a VFIO drive used for hotplugging From 84e0ee13c8de454c1e6809e2089a4d09d53f9794 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 20 Mar 2020 14:12:38 +0000 Subject: [PATCH 6/8] virtcontainers: reimplement `createBlockDevices` Reimplement `createBlockDevices` to identify possible volumes that can be used as pmem devices Signed-off-by: Julio Montes --- virtcontainers/container.go | 31 +++++++++++++++++++++--- virtcontainers/device/manager/manager.go | 16 +++++++----- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index bfb32e9a42..4ba7ee2e3a 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -643,6 +643,11 @@ func filterDevices(c *Container, devices []ContainerDevice) (ret []ContainerDevi } func (c *Container) createBlockDevices() error { + if !c.checkBlockDeviceSupport() { + c.Logger().Warn("Block device not supported") + return nil + } + // iterate all mounts and create block device if it's block based. for i, m := range c.mounts { if len(m.BlockDeviceID) > 0 || m.Type != "bind" { @@ -657,18 +662,36 @@ func (c *Container) createBlockDevices() error { return fmt.Errorf("stat %q failed: %v", m.Source, err) } + var di *config.DeviceInfo + var err error + // Check if mount is a block device file. If it is, the block device will be attached to the host // instead of passing this as a shared mount. - if c.checkBlockDeviceSupport() && stat.Mode&unix.S_IFBLK == unix.S_IFBLK { - b, err := c.sandbox.devManager.NewDevice(config.DeviceInfo{ + if stat.Mode&unix.S_IFBLK == unix.S_IFBLK { + di = &config.DeviceInfo{ HostPath: m.Source, ContainerPath: m.Destination, DevType: "b", Major: int64(unix.Major(stat.Rdev)), Minor: int64(unix.Minor(stat.Rdev)), - }) + } + // check whether source can be used as a pmem device + } else if di, err = config.PmemDeviceInfo(m.Source, m.Destination); err != nil { + c.Logger().WithError(err). 
+ WithField("mount-source", m.Source). + Debug("no loop device") + } + + if err == nil && di != nil { + b, err := c.sandbox.devManager.NewDevice(*di) + if err != nil { - return fmt.Errorf("device manager failed to create new device for %q: %v", m.Source, err) + // Do not return an error, try to create + // devices for other mounts + c.Logger().WithError(err).WithField("mount-source", m.Source). + Error("device manager failed to create new device") + continue + } c.mounts[i].BlockDeviceID = b.DeviceID() diff --git a/virtcontainers/device/manager/manager.go b/virtcontainers/device/manager/manager.go index 531cbd1ddb..6a2b665a5e 100644 --- a/virtcontainers/device/manager/manager.go +++ b/virtcontainers/device/manager/manager.go @@ -98,11 +98,15 @@ func (dm *deviceManager) findDeviceByMajorMinor(major, minor int64) api.Device { // createDevice creates one device based on DeviceInfo func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device, err error) { - path, err := config.GetHostPathFunc(devInfo, dm.vhostUserStoreEnabled, dm.vhostUserStorePath) - if err != nil { - return nil, err + // pmem device may points to block devices or raw files, + // do not change its HostPath. 
+ if !devInfo.Pmem { + path, err := config.GetHostPathFunc(devInfo, dm.vhostUserStoreEnabled, dm.vhostUserStorePath) + if err != nil { + return nil, err + } + devInfo.HostPath = path } - devInfo.HostPath = path defer func() { if err == nil { @@ -119,7 +123,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device if devInfo.ID, err = dm.newDeviceID(); err != nil { return nil, err } - if isVFIO(path) { + if isVFIO(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil } else if isVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { @@ -134,7 +138,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device devInfo.DriverOptions["block-driver"] = dm.blockDriver return drivers.NewBlockDevice(&devInfo), nil } else { - deviceLogger().WithField("device", path).Info("Device has not been passed to the container") + deviceLogger().WithField("device", devInfo.HostPath).Info("Device has not been passed to the container") return drivers.NewGenericDevice(&devInfo), nil } } From 434b30255edb4fe778a1056cf249e4daace79731 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 28 Feb 2020 21:32:27 +0000 Subject: [PATCH 7/8] virtcontainers: hotplug block drives that are pmem devices as nvdimm Hotplug as NVDIMM devices the block drives that can be used as pmem devices (`Pmem=true`). The host path to such devices is a raw file that contains the PFN signature. 
Signed-off-by: Julio Montes --- virtcontainers/qemu.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index ab9c4c2c9f..694db22cf1 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -1051,16 +1051,28 @@ func (q *qemu) qmpShutdown() { } func (q *qemu) hotplugAddBlockDevice(drive *config.BlockDrive, op operation, devID string) (err error) { - if q.config.BlockDeviceDriver == config.Nvdimm { + // drive can be a pmem device, in which case it's used as backing file for a nvdimm device + if q.config.BlockDeviceDriver == config.Nvdimm || drive.Pmem { var blocksize int64 file, err := os.Open(drive.File) if err != nil { return err } - if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 { + defer file.Close() + + st, err := file.Stat() + if err != nil { + return fmt.Errorf("failed to get information from nvdimm device %v: %v", drive.File, err) + } + + // regular files do not support syscall BLKGETSIZE64 + if st.Mode().IsRegular() { + blocksize = st.Size() + } else if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 { return err } - if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize); err != nil { + + if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize, &drive.Pmem); err != nil { q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", drive.File) return err } From 2c310fecd4910c8216baf361f6e69d876044d945 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Fri, 20 Mar 2020 14:51:10 +0000 Subject: [PATCH 8/8] virtcontainers: handle persistent memory volumes A persistent memory volume MUST meet the following conditions: * A loop device must be mounted in the directory passed as volume * The loop device must have a 
backing file * The backing file must have the PFN signature at offset 4k [1][2] The backing file is used as backend file for a NVDIMM device in the guest fixes #2262 [1] - https://github.com/kata-containers/osbuilder/blob/master/image-builder /nsdax.gpl.c [2] - https://github.com/torvalds/linux/blob/master/drivers/nvdimm/pfn.h Signed-off-by: Julio Montes --- virtcontainers/kata_agent.go | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index 987fb6354e..c9154aaf58 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -1099,6 +1099,13 @@ func (k *kataAgent) appendBlockDevice(dev ContainerDevice, c *Container) *grpc.D return nil } + if d.Pmem { + // block drive is a persistent memory device that + // was passed as volume (-v) not as device (--device). + // It shouldn't be visible in the container + return nil + } + kataDevice := &grpc.Device{ ContainerPath: dev.ContainerPath, } @@ -1461,6 +1468,12 @@ func (k *kataAgent) handleDeviceBlockVolume(c *Container, device api.Device) (*g return nil, fmt.Errorf("malformed block drive") } switch { + // pmem volumes case + case blockDrive.Pmem: + vol.Driver = kataNvdimmDevType + vol.Source = fmt.Sprintf("/dev/pmem%s", blockDrive.NvdimmID) + vol.Fstype = blockDrive.Format + vol.Options = []string{"dax"} case c.sandbox.config.HypervisorConfig.BlockDeviceDriver == config.VirtioBlockCCW: vol.Driver = kataBlkCCWDevType vol.Source = blockDrive.DevNo @@ -1538,8 +1551,12 @@ func (k *kataAgent) handleBlockVolumes(c *Container) ([]*grpc.Storage, error) { } vol.MountPoint = m.Destination - vol.Fstype = "bind" - vol.Options = []string{"bind"} + if vol.Fstype == "" { + vol.Fstype = "bind" + } + if len(vol.Options) == 0 { + vol.Options = []string{"bind"} + } volumeStorages = append(volumeStorages, vol) }