From ca6f3c0614042c25e0f045b0abd8d210403fd4c7 Mon Sep 17 00:00:00 2001 From: Maksim An Date: Thu, 6 Jun 2024 20:20:52 -0700 Subject: [PATCH] feature: block-device mounts This PR adds the capability to mount virtual and passthrough disks as block devices inside containers. We add a new "blockdev://" prefix to OCI `Mount.ContainerPath`, which indicates that the source should be mounted as a block device. A new `BlockDev` field has been added to `mountConfig` used by `mountManager`, which indicates that the SCSI attachment should be mounted as a block device. The GCS has also been updated to handle `BlockDev`. Instead of mounting the filesystem, GCS creates a symlink to the block device corresponding to the SCSI attachment. The symlink path is set by shim as a source of bind mount in OCI container spec. GCS resolves the symlink and adds the corresponding device cgroup. Without the cgroup, the container won't be able to work with the block device. We chose a symlink approach instead of bind mounting the device directly, because the shim doesn't know the path at which the device will appear inside UVM. For a direct bind mount to work, we would either need to encode the SCSI controller/LUN in the OCI mount's HostPath or update the communication protocol between the shim and GCS, where GCS would either return the device path, or add capability for the shim to query for it. Below are some CRI container config examples for physical and virtual disks: Passthrough physical disk: ```json { ... "mounts": [ { "host_path": "\\\\.\\PHYSICALDRIVE1", "container_path": "blockdev:///my/block/mount", "readonly": false } ] ... 
} ``` Signed-off-by: Maksim An --- internal/guest/runtime/hcsv2/uvm.go | 2 + .../guest/runtime/hcsv2/workload_container.go | 44 +++++++++++++++++++ internal/guest/storage/scsi/scsi.go | 26 +++++++++++ internal/guestpath/paths.go | 3 ++ internal/hcsoci/resources_lcow.go | 19 +++++--- internal/protocol/guestresource/resources.go | 1 + internal/uvm/scsi/backend.go | 2 + internal/uvm/scsi/manager.go | 5 +++ internal/uvm/scsi/mount.go | 1 + 9 files changed, 98 insertions(+), 5 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 617d2bdf7d..f4fb3be5e6 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -1005,6 +1005,7 @@ func modifyMappedVirtualDisk( VerityInfo: verityInfo, EnsureFilesystem: mvd.EnsureFilesystem, Filesystem: mvd.Filesystem, + BlockDev: mvd.BlockDev, } return scsi.Mount(mountCtx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, mvd.ReadOnly, mvd.Options, config) @@ -1022,6 +1023,7 @@ func modifyMappedVirtualDisk( VerityInfo: verityInfo, EnsureFilesystem: mvd.EnsureFilesystem, Filesystem: mvd.Filesystem, + BlockDev: mvd.BlockDev, } if err := scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath, config); err != nil { diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index 28349de5df..26c0fd0f6a 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -5,10 +5,12 @@ package hcsv2 import ( "context" + "fmt" "os" "path/filepath" "strings" + "github.com/opencontainers/runc/libcontainer/devices" oci "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "go.opencensus.io/trace" @@ -84,6 +86,44 @@ func updateHugePageMounts(sbid string, spec *oci.Spec) error { return nil } +func updateBlockDeviceMounts(spec *oci.Spec) error { + for i, m := range spec.Mounts { + if 
!strings.HasPrefix(m.Destination, guestpath.BlockDevMountPrefix) { + continue + } + permissions := "rwm" + for _, o := range m.Options { + if o == "ro" { + permissions = "r" + } + } + + // For block device mounts, the source will be a symlink. Resolve it first + // before passing to `DeviceFromPath`, which expects a real device path. + rPath, err := os.Readlink(m.Source) + if err != nil { + return fmt.Errorf("failed to readlink %s: %w", m.Source, err) + } + + sourceDevice, err := devices.DeviceFromPath(rPath, permissions) + if err != nil { + return fmt.Errorf("failed to get device from path: %w", err) + } + + deviceCgroup := oci.LinuxDeviceCgroup{ + Allow: true, + Type: string(sourceDevice.Type), + Major: &sourceDevice.Major, + Minor: &sourceDevice.Minor, + Access: string(sourceDevice.Permissions), + } + + spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices, deviceCgroup) + spec.Mounts[i].Destination = strings.TrimPrefix(m.Destination, guestpath.BlockDevMountPrefix) + } + return nil +} + func specHasGPUDevice(spec *oci.Spec) bool { for _, d := range spec.Windows.Devices { if d.IDType == "gpu" { @@ -115,6 +155,10 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci. return errors.Wrapf(err, "failed to update hugepages mounts for container %v in sandbox %v", id, sbid) } + if err = updateBlockDeviceMounts(spec); err != nil { + return fmt.Errorf("failed to update block device mounts for container %v in sandbox %v: %w", id, sbid, err) + } + // Add default mounts for container networking (e.g. /etc/hostname, /etc/hosts), // if spec didn't override them explicitly. 
networkingMounts := specInternal.GenerateWorkloadContainerNetworkMounts(sbid, spec) diff --git a/internal/guest/storage/scsi/scsi.go b/internal/guest/storage/scsi/scsi.go index a24946e467..61ff383372 100644 --- a/internal/guest/storage/scsi/scsi.go +++ b/internal/guest/storage/scsi/scsi.go @@ -112,6 +112,7 @@ type Config struct { VerityInfo *guestresource.DeviceVerityInfo EnsureFilesystem bool Filesystem string + BlockDev bool } // Mount creates a mount from the SCSI device on `controller` index `lun` to @@ -163,6 +164,22 @@ func Mount( } } + // create and symlink block device mount target + if config.BlockDev { + parent := filepath.Dir(target) + log.G(ctx).WithField("parent", parent).Debug("creating parent directory for block device mount") + if err := osMkdirAll(parent, 0700); err != nil { + return err + } + defer func() { + if err != nil { + _ = os.Remove(target) + } + }() + log.G(ctx).Debug("creating symlink for block device mount") + return os.Symlink(source, target) + } + if err := osMkdirAll(target, 0700); err != nil { return err } @@ -280,6 +297,15 @@ func Unmount( trace.Int64Attribute("partition", int64(partition)), trace.StringAttribute("target", target)) + // skip unmount logic for block devices, since they are just symlinks + if config.BlockDev { + log.G(ctx).Debug("removing symlink for block device mount") + if err := os.Remove(target); err != nil { + return fmt.Errorf("failed to remove symlink: %w", err) + } + return nil + } + // unmount target if err := storageUnmountPath(ctx, target, true); err != nil { return errors.Wrapf(err, "unmount failed: %s", target) diff --git a/internal/guestpath/paths.go b/internal/guestpath/paths.go index 8ab4ac7159..1ff048c406 100644 --- a/internal/guestpath/paths.go +++ b/internal/guestpath/paths.go @@ -13,6 +13,9 @@ const ( // HugePagesMountPrefix is mount prefix used in container spec to mark a // huge-pages mount HugePagesMountPrefix = "hugepages://" + // BlockDevMountPrefix is mount prefix used in container spec to 
mark a + // block-device mount. + BlockDevMountPrefix = "blockdev://" // PipePrefix is the mount prefix used in container spec to mark a named pipe PipePrefix = `\\.\pipe` // LCOWMountPathPrefixFmt is the path format in the LCOW UVM where diff --git a/internal/hcsoci/resources_lcow.go b/internal/hcsoci/resources_lcow.go index a99496b9a1..b98493ce4c 100644 --- a/internal/hcsoci/resources_lcow.go +++ b/internal/hcsoci/resources_lcow.go @@ -83,6 +83,8 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * } l := log.G(ctx).WithField("mount", fmt.Sprintf("%+v", mount)) + + isBlockDev := strings.HasPrefix(mount.Destination, guestpath.BlockDevMountPrefix) if mount.Type == MountTypePhysicalDisk { l.Debug("hcsshim::allocateLinuxResources Hot-adding SCSI physical disk for OCI mount") scsiMount, err := coi.HostingSystem.SCSIManager.AddPhysicalDisk( @@ -90,15 +92,18 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * hostPath, readOnly, coi.HostingSystem.ID(), - &scsi.MountConfig{Options: mount.Options}, + &scsi.MountConfig{Options: mount.Options, BlockDev: isBlockDev}, ) if err != nil { return errors.Wrapf(err, "adding SCSI physical disk mount %+v", mount) } - uvmPathForFile = scsiMount.GuestPath() r.Add(scsiMount) - coi.Spec.Mounts[i].Type = "none" + mt := "none" + if isBlockDev { + mt = "bind" + } + coi.Spec.Mounts[i].Type = mt } else if mount.Type == MountTypeVirtualDisk { l.Debug("hcsshim::allocateLinuxResources Hot-adding SCSI virtual disk for OCI mount") @@ -109,7 +114,7 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * hostPath, readOnly, coi.HostingSystem.ID(), - &scsi.MountConfig{Options: mount.Options}, + &scsi.MountConfig{Options: mount.Options, BlockDev: isBlockDev}, ) if err != nil { return errors.Wrapf(err, "adding SCSI virtual disk mount %+v", mount) @@ -117,7 +122,11 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * uvmPathForFile = 
scsiMount.GuestPath() r.Add(scsiMount) - coi.Spec.Mounts[i].Type = "none" + mt := "none" + if isBlockDev { + mt = "bind" + } + coi.Spec.Mounts[i].Type = mt } else if strings.HasPrefix(mount.Source, guestpath.SandboxMountPrefix) { // Mounts that map to a path in UVM are specified with 'sandbox://' prefix. // example: sandbox:///a/dirInUvm destination:/b/dirInContainer diff --git a/internal/protocol/guestresource/resources.go b/internal/protocol/guestresource/resources.go index 8b68bc4d7d..1eb695e801 100644 --- a/internal/protocol/guestresource/resources.go +++ b/internal/protocol/guestresource/resources.go @@ -85,6 +85,7 @@ type LCOWMappedVirtualDisk struct { ReadOnly bool `json:"ReadOnly,omitempty"` Encrypted bool `json:"Encrypted,omitempty"` Options []string `json:"Options,omitempty"` + BlockDev bool `json:"BlockDev,omitempty"` // Deprecated: verity info is read by the guest VerityInfo *DeviceVerityInfo `json:"VerityInfo,omitempty"` EnsureFilesystem bool `json:"EnsureFilesystem,omitempty"` diff --git a/internal/uvm/scsi/backend.go b/internal/uvm/scsi/backend.go index 9194b94324..6219a15172 100644 --- a/internal/uvm/scsi/backend.go +++ b/internal/uvm/scsi/backend.go @@ -196,6 +196,7 @@ func mountRequest(controller, lun uint, path string, config *mountConfig, osType Options: config.options, EnsureFilesystem: config.ensureFilesystem, Filesystem: config.filesystem, + BlockDev: config.blockDev, } default: return guestrequest.ModificationRequest{}, fmt.Errorf("unsupported os type: %s", osType) @@ -221,6 +222,7 @@ func unmountRequest(controller, lun uint, path string, config *mountConfig, osTy Lun: uint8(lun), Partition: config.partition, Controller: uint8(controller), + BlockDev: config.blockDev, } default: return guestrequest.ModificationRequest{}, fmt.Errorf("unsupported os type: %s", osType) diff --git a/internal/uvm/scsi/manager.go b/internal/uvm/scsi/manager.go index bcf87c76ca..fd7173b5b1 100644 --- a/internal/uvm/scsi/manager.go +++ b/internal/uvm/scsi/manager.go 
@@ -83,6 +83,9 @@ type MountConfig struct { // mounted as. // This is only supported for LCOW. Filesystem string + // BlockDev indicates if the device should be mounted as a block device. + // This is only supported for LCOW. + BlockDev bool } // Mount represents a SCSI device that has been attached to a VM, and potentially @@ -157,6 +160,7 @@ func (m *Manager) AddVirtualDisk( options: mc.Options, ensureFilesystem: mc.EnsureFilesystem, filesystem: mc.Filesystem, + blockDev: mc.BlockDev, } } return m.add(ctx, @@ -202,6 +206,7 @@ func (m *Manager) AddPhysicalDisk( options: mc.Options, ensureFilesystem: mc.EnsureFilesystem, filesystem: mc.Filesystem, + blockDev: mc.BlockDev, } } return m.add(ctx, diff --git a/internal/uvm/scsi/mount.go b/internal/uvm/scsi/mount.go index c4bdcc6d81..ed77fa9991 100644 --- a/internal/uvm/scsi/mount.go +++ b/internal/uvm/scsi/mount.go @@ -41,6 +41,7 @@ type mountConfig struct { partition uint64 readOnly bool encrypted bool + blockDev bool options []string ensureFilesystem bool filesystem string