From 187179e847095e8177d9e61003c0f27393b86cfe Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 16 Dec 2019 13:29:21 +0100 Subject: [PATCH] imagefile: embed namespace and filesystem inside a file The resulting file can be used as backing store for a QEMU nvdimm device. This is based on the approach that is used for the Kata Container rootfs (https://github.com/kata-containers/osbuilder/blob/dbbf16082da3de37d89af0783e023269210b2c91/image-builder/image_builder.sh) and reuses some of the same code, but also differs from that in some regards: - The start of the partition is aligned a multiple of the 2MiB huge page size (https://github.com/kata-containers/runtime/issues/2262#issuecomment-566956462). - The size of the QEMU object is the same as the nominal size of the file. In Kata Containers the size is a fixed 128MiB (https://github.com/kata-containers/osbuilder/issues/391#issuecomment-566651878). --- pkg/imagefile/imagefile.go | 507 ++++++++++++++++++++++++++++++++ pkg/imagefile/imagefile_test.go | 292 ++++++++++++++++++ test/check-imagefile.sh | 119 ++++++++ 3 files changed, 918 insertions(+) create mode 100644 pkg/imagefile/imagefile.go create mode 100644 pkg/imagefile/imagefile_test.go create mode 100755 test/check-imagefile.sh diff --git a/pkg/imagefile/imagefile.go b/pkg/imagefile/imagefile.go new file mode 100644 index 0000000000..72283592d4 --- /dev/null +++ b/pkg/imagefile/imagefile.go @@ -0,0 +1,507 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +This file contains code originally published by Intel under GPL: +- https://github.com/torvalds/linux/blob/72c0870e3a05d9cd5466d08c3d2a3069ed0a2f9f/drivers/nvdimm/claim.c#L228-L249 + from https://github.com/torvalds/linux/commit/e1455744b27c9e6115c3508a7b2902157c2c4347 +- https://github.com/torvalds/linux/blob/72c0870e3a05d9cd5466d08c3d2a3069ed0a2f9f/drivers/nvdimm/core.c#L179-L193 + from 
https://github.com/torvalds/linux/commit/eaf961536e1622ad21247ac8d44acd48ba65566e +- https://github.com/torvalds/linux/blob/a3619190d62ed9d66416891be2416f6bea2b3ca4/drivers/nvdimm/pfn.h#L12-L34 + from multiple commits by djb, see https://github.com/torvalds/linux/blame/a3619190d62ed9d66416891be2416f6bea2b3ca4/drivers/nvdimm/pfn.h#L12-L34 +- https://github.com/kata-containers/osbuilder/blob/dbbf16082da3de37d89af0783e023269210b2c91/image-builder/nsdax.gpl.c#L1-L171 + from https://github.com/kata-containers/osbuilder/commit/726f798ff795ef4a8300201cab8d83e83c1496a5#diff-1d1124b18f3d6153eb2a9bba67c6314d + +That code gets re-published here under Apache-2.0. + +Furthermore, this file is based on the following code published by Intel under Apache-2.0: +- https://github.com/kata-containers/osbuilder/blob/d1751a35e1bd1613e66df87221faed195225718e/image-builder/image_builder.sh +*/ + +/* +Package imagefile contains code to create a file with the following content: + + .-----------.----------.---------------.-----------. + | 0 - 512 B | 4 - 8 Kb | 2M - 2M+512B | 4M | + |-----------+----------+---------------+-----------+ + | MBR #1 | DAX | MBR #2 | FS | + '-----------'----------'---------------'-----------+ + | | ^ | ^ + | '-data-' '--------' + | | + '--------rootfs-partition---------' + + ^ ^ ^ + daxHeaderOffset | | + daxHeaderSize | + HeaderSize + + +MBR: Master boot record. +DAX: Metadata required by the NVDIMM driver to enable DAX in the guest [1][2] (struct nd_pfn_sb). +FS: partition that contains a filesystem. + +Kernels and hypervisors that support DAX/NVDIMM read the MBR #2, otherwise MBR #1 is read. +The /dev/pmem0 device starts at the MBR which is used. + +When such a file is created on a dax-capable filesystem, then it can +be used as backing store for a [QEMU nvdimm +device](https://github.com/qemu/qemu/blob/master/docs/nvdimm.txt) such +that the guest kernel provides a /dev/pmem0p1 (the FS partition above) +which can be mounted with -odax. 
For full dax semantic, the QEMU +device configuration must use the 'pmem' and 'share' flags. +*/ +package imagefile + +// Here the original C code gets included verbatim and compiled with cgo. + +// #include +// #include +// #include +// +// #define __KERNEL__ +// #include +// #include +// +// /* +// Next types, definitions and functions were copied from kernel 4.19.24 source +// code, specifically from nvdimm driver +// */ +// +// #define PFN_SIG_LEN 16 +// #define PFN_SIG "NVDIMM_PFN_INFO\0" +// #define SZ_4K 0x00001000 +// +// typedef __u16 u16; +// typedef __u8 u8; +// typedef __u64 u64; +// typedef __u32 u32; +// typedef int bool; +// +// enum nd_pfn_mode { +// PFN_MODE_NONE, +// PFN_MODE_RAM, +// PFN_MODE_PMEM, +// }; +// +// struct nd_pfn_sb { +// u8 signature[PFN_SIG_LEN]; +// u8 uuid[16]; +// u8 parent_uuid[16]; +// __le32 flags; +// __le16 version_major; +// __le16 version_minor; +// __le64 dataoff; /* relative to namespace_base + start_pad */ +// __le64 npfns; +// __le32 mode; +// /* minor-version-1 additions for section alignment */ +// __le32 start_pad; +// __le32 end_trunc; +// /* minor-version-2 record the base alignment of the mapping */ +// __le32 align; +// /* minor-version-3 guarantee the padding and flags are zero */ +// u8 padding[4000]; +// __le64 checksum; +// }; +// +// struct nd_gen_sb { +// char reserved[SZ_4K - 8]; +// __le64 checksum; +// }; +// +// u64 nd_fletcher64(void *addr, size_t len, bool le) +// { +// u32 *buf = addr; +// u32 lo32 = 0; +// u64 hi32 = 0; +// int i; +// +// for (i = 0; i < len / sizeof(u32); i++) { +// lo32 += le ? le32toh((__le32) buf[i]) : buf[i]; +// hi32 += lo32; +// } +// +// return hi32 << 32 | lo32; +// } +// +// /* +// * nd_sb_checksum: compute checksum for a generic info block +// * +// * Returns a fletcher64 checksum of everything in the given info block +// * except the last field (since that's where the checksum lives). 
+// */ +// u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +// { +// u64 sum; +// __le64 sum_save; +// +// sum_save = nd_gen_sb->checksum; +// nd_gen_sb->checksum = 0; +// sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); +// nd_gen_sb->checksum = sum_save; +// return sum; +// } +// +// void nsdax(void *p, unsigned int data_offset, unsigned int alignment) +// { +// struct nd_pfn_sb *sb = p; +// memset(sb, 0, sizeof(*sb)); +// +// strcpy((char*)sb->signature, PFN_SIG); +// sb->mode = PFN_MODE_RAM; +// sb->align = htole32(alignment); +// sb->dataoff = htole64((unsigned long)data_offset); +// sb->version_minor = 2; +// +// // checksum must be calculated at the end +// sb->checksum = nd_sb_checksum((struct nd_gen_sb*)sb); +// } +import "C" + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "syscall" + + "github.com/intel/pmem-csi/third-party/go-fibmap" +) + +type FsType string + +const ( + Ext4 FsType = "ext4" + Xfs FsType = "xfs" +) + +type Bytes int64 + +const ( + MiB = Bytes(1024 * 1024) + KiB = Bytes(1024) + + // Alignment of partitions relative to the start of the block device, i.e. the MBR. + // For DAX huge pages this has to be 2MB, see https://nvdimm.wiki.kernel.org/2mib_fs_dax + DaxAlignment = 2 * MiB + + // DAX header offset in the file. This is where the Linux kernel expects it + // (https://github.com/torvalds/linux/blob/2187f215ebaac73ddbd814696d7c7fa34f0c3de0/drivers/nvdimm/pfn_devs.c#L438-L596). + daxHeaderOffset = 4 * KiB + + // Start of MBR#2. + // Chosen so that we have enough space before it for MBR #1 and the DAX metadata, + // doesn't really have to be aligned. + daxHeaderSize = DaxAlignment + + // Total size of the MBR + DAX data before the actual partition. + // This has to be aligned relative to MBR#2, which is at daxHeaderSize, + // for huge pages to work inside a VM. + HeaderSize = daxHeaderSize + DaxAlignment + + // Block size used for the filesystem. ext4 only supports dax with 4KiB blocks. 
+ BlockSize = 4 * KiB +) + +// Create writes a complete image file of a certain total size. +// The size must be a multiple of DaxAlignment. The resulting +// partition then is size - HeaderSize large. +// +// Depending on the filesystem, additional constraints apply +// for the size of that partition. A multiple of BlockSize +// should be safe. +// +// The resulting file will have "size" bytes in use, but the nominal +// size may be larger to accommodate alignment requirements when +// creating a nvdimm device for QEMU. The extra bytes at the end +// are not allocated and never should be read or written because +// the partition ends before them. +// +// The result will be sparse, i.e. empty parts are not actually +// written yet, but they will be allocated, so there is no risk +// later on that attempting to write fails due to lack of space. +func Create(filename string, size Bytes, fs FsType) error { + if size <= HeaderSize { + return fmt.Errorf("invalid image file size %d, must be larger than HeaderSize=%d", size, HeaderSize) + } + fsSize := size - HeaderSize + + dir, err := os.Open(filepath.Dir(filename)) + if err != nil { + return err + } + defer dir.Close() + + file, err := os.Create(filename) + if err != nil { + return err + } + + // Delete file on failure. + success := false + defer func() { + if !success { + os.Remove(filename) + } + }() + defer file.Close() + + // Enlarge the file and ensure that we really have enough space for it. + if err := file.Truncate(int64(size)); err != nil { + return fmt.Errorf("resize %q to %d: %w", filename, size, err) + } + if err := syscall.Fallocate(int(file.Fd()), 0, 0, int64(size)); err != nil { + return fmt.Errorf("fallocate %q size %d: %w", filename, size, err) + } + + // We write MBRs and rootfs into temporary files, then copy into the + // final image file at the end. 
+ tmp, err := ioutil.TempDir("", "pmem-csi-imagefile") + if err != nil { + return fmt.Errorf("temp dir: %w", err) + } + defer os.RemoveAll(tmp) + mbr1 := filepath.Join(tmp, "mbr1") + mbr2 := filepath.Join(tmp, "mbr2") + fsimage := filepath.Join(tmp, "fsimage") + + // This is for the full image file. + if err := writeMBR(mbr1, fs, HeaderSize, size); err != nil { + return err + } + + // This is for the image file minus the dax header. + if err := writeMBR(mbr2, fs, DaxAlignment, size-daxHeaderSize); err != nil { + return err + } + + // Create a file of the desired size, then let mkfs write into it. + fsFile, err := os.Create(fsimage) + if err != nil { + return err + } + defer fsFile.Close() + if err := fsFile.Truncate(int64(fsSize)); err != nil { + return err + } + args := []string{fmt.Sprintf("mkfs.%s", fs)} + // Required for dax semantic. + switch fs { + case Ext4: + args = append(args, "-b", fmt.Sprintf("%d", BlockSize)) + case Xfs: + args = append(args, "-b", fmt.Sprintf("size=%d", BlockSize)) + } + args = append(args, fsimage) + cmd := exec.Command(args[0], args[1:]...) + if _, err := cmd.Output(); err != nil { + return fmt.Errorf("mkfs.%s for fs of size %d: %w", fs, fsSize, err) + } + + // Now copy to the actual file. + if err := dd(mbr1, filename, true, 0); err != nil { + return err + } + if _, err := file.Seek(int64(daxHeaderOffset), os.SEEK_SET); err != nil { + return err + } + if _, err := file.Write(nsdax(uint(daxHeaderSize), uint(DaxAlignment))); err != nil { + return err + } + if err := dd(mbr2, filename, true, daxHeaderSize); err != nil { + return err + } + if err := dd(fsimage, filename, true, HeaderSize); err != nil { + return err + } + + // Some (but not all) kernels seem to expect the entire file to align at + // a 2MiB boundary. Therefore we round up and add some un-allocated padding + // at the end. 
+ newSize := (size + DaxAlignment - 1) / DaxAlignment * DaxAlignment + if err := file.Truncate(int64(newSize)); err != nil { + return fmt.Errorf("resize %q to %d: %w", filename, newSize, err) + } + + // Safely close the file: + // - sync content + // - close it + // - sync directory + if err := file.Sync(); err != nil { + return fmt.Errorf("syncing %q: %w", filename, err) + } + if err := file.Close(); err != nil { + return fmt.Errorf("closing %q: %w", filename, err) + } + if err := dir.Sync(); err != nil { + return fmt.Errorf("syncing parent directory of %q: %w", filename, err) + } + + success = true + return nil +} + +// nsdax prepares 4KiB of DAX metadata. +func nsdax(dataOffset uint, alignment uint) []byte { + p := C.malloc(C.sizeof_struct_nd_pfn_sb) + defer C.free(p) + + // Fill allocated memory... + C.nsdax(p, C.uint(dataOffset), C.uint(alignment)) + + // ... and return a copy in a normal slice. + return C.GoBytes(p, C.sizeof_struct_nd_pfn_sb) +} + +// writeMBR writes a master boot record at the start of the given image file. +func writeMBR(to string, fs FsType, partitionStart Bytes, partitionEnd Bytes) error { + // Doesn't have to be a block device, but must exist and be large enough. + file, err := os.Create(to) + if err != nil { + return err + } + defer file.Close() + if err := file.Truncate(int64(partitionEnd)); err != nil { + return fmt.Errorf("resize %q to %d: %w", to, partitionEnd, err) + } + + // From https://github.com/kata-containers/osbuilder/blob/d1751a35e1bd1613e66df87221faed195225718e/image-builder/image_builder.sh#L346-L348 + // with some changes: + // - no alignment + // - subtract one from the end because it looks like start and end of the partition + // are both inclusive; at least for end == size of file we get an error + // (Error: The location .... is outside of the device ...). 
+ cmd := exec.Command("parted", "--script", "--align", "none", to, "--", + "mklabel", "msdos", + "mkpart", "primary", string(fs), + fmt.Sprintf("%dB", partitionStart), + fmt.Sprintf("%dB", partitionEnd-1), + ) + if _, err := cmd.Output(); err != nil { + return fmt.Errorf("write MBR with parted to %q: %w", to, err) + } + return nil +} + +// dd copies one complete file into another at a certain target offset. +// Whether it actually writes the data even when it's just zeros is +// configurable. +func dd(from, to string, sparse bool, seek Bytes) error { + fi, err := os.Stat(from) + if err != nil { + return err + } + + var extents []fibmap.Extent + if sparse { + e, err := getAllExtents(from) + if err != nil { + return err + } + extents = e + } else { + // Copy the entire file. + extents = append(extents, fibmap.Extent{Length: uint64(fi.Size())}) + } + + in, err := os.Open(from) + if err != nil { + return err + } + defer in.Close() + out, err := os.OpenFile(to, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + for _, extent := range extents { + // The extent might stretch beyond the end of the file. + length := int64(extent.Length) + remaining := fi.Size() - int64(extent.Logical) + if length > remaining { + length = remaining + } + if err := copyRange(in, out, int64(extent.Logical), int64(seek)+int64(extent.Logical), length); err != nil { + return err + } + } + + return nil +} + +const ioChunkSize = 256 * 1024 * 1024 + +func copyRange(from, to *os.File, skip, seek, size int64) error { + buffer := make([]byte, ioChunkSize) + + if _, err := from.Seek(skip, os.SEEK_SET); err != nil { + return err + } + if _, err := to.Seek(seek, os.SEEK_SET); err != nil { + return err + } + remaining := size + for remaining > 0 { + current := remaining + if current > int64(len(buffer)) { + current = int64(len(buffer)) + } + + // We shouldn't run into io.EOF here because we don't + // attempt to read past the end of the file, so any error + // is a reason to fail. 
+ read, err := from.Read(buffer[0:current]) + if err != nil { + return err + } + if int64(read) < current { + return fmt.Errorf("%q: unexpected short read, got %d instead of %d bytes", from.Name(), read, current) + } + // In contrast to reads, a short write is guaranteed to return an error. + if _, err := to.Write(buffer[0:current]); err != nil { + return err + } + remaining = remaining - current + } + return nil +} + +const initialExtentSize = 16 + +func getAllExtents(from string) ([]fibmap.Extent, error) { + file, err := os.Open(from) + if err != nil { + return nil, err + } + fibmapFile := fibmap.NewFibmapFile(file) + + // Try with FIEMAP first. + for size := uint32(initialExtentSize); ; size = size * 2 { + extents, err := fibmapFile.Fiemap(size) + // Got all extents? + if err == 0 && + (len(extents) == 0 || (extents[len(extents)-1].Flags&fibmap.FIEMAP_EXTENT_LAST) != 0) { + return extents, nil + } + if err == syscall.ENOTSUP { + break + } + if err != 0 { + return nil, &os.PathError{Op: "fibmap", Path: from, + Err: &os.SyscallError{Syscall: "ioctl", Err: err}} + } + } + + // Not supported by tmpfs, which supports SEEK_DATA and SEEK_HOLE. Fall back to that. 
+ // TODO: error reporting in SeekDataHole() + offsets := fibmapFile.SeekDataHole() + var extents []fibmap.Extent + for i := 0; i+1 < len(offsets); i += 2 { + extents = append(extents, fibmap.Extent{Logical: uint64(offsets[i]), Length: uint64(offsets[i+1])}) + } + return extents, nil +} diff --git a/pkg/imagefile/imagefile_test.go b/pkg/imagefile/imagefile_test.go new file mode 100644 index 0000000000..f613bfca4d --- /dev/null +++ b/pkg/imagefile/imagefile_test.go @@ -0,0 +1,292 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +*/ + +package imagefile + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" + "testing" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/stretchr/testify/assert" +) + +func TestNsdax(t *testing.T) { + type testcase struct { + dataOffset uint + alignment uint + odDump string + } + + // Expected output comes from: + // - curl -O https://github.com/kata-containers/osbuilder/raw/726f798ff795ef4a8300201cab8d83e83c1496a5/image-builder/nsdax.gpl.c + // - gcc -o nsdax.gpl nsdax.gpl.c + // - truncate -s 0 /tmp/image + // - ./nsdax.gpl /tmp/image + // - tail -c +$((0x00001001)) /tmp/image | od -t x1 -a + // + // 0x00001001 = SZ_4K + 1 + tests := []struct { + dataOffset uint + alignment uint + odDump string + }{ + {1024, 2048, + `0000000 4e 56 44 49 4d 4d 5f 50 46 4e 5f 49 4e 46 4f 00 + N V D I M M _ P F N _ I N F O nul +0000020 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0000060 00 00 00 00 00 00 02 00 00 04 00 00 00 00 00 00 + nul nul nul nul nul nul stx nul nul eot nul nul nul nul nul nul +0000100 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul soh nul nul nul nul nul nul nul +0000120 00 00 00 00 00 08 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul bs nul nul nul nul nul nul nul nul nul nul +0000140 00 
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0007760 00 00 00 00 00 00 00 00 30 44 54 e3 2b 23 ea 6c + nul nul nul nul nul nul nul nul 0 D T c + # j l +0010000 +`}, + {1, 2, `0000000 4e 56 44 49 4d 4d 5f 50 46 4e 5f 49 4e 46 4f 00 + N V D I M M _ P F N _ I N F O nul +0000020 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0000060 00 00 00 00 00 00 02 00 01 00 00 00 00 00 00 00 + nul nul nul nul nul nul stx nul soh nul nul nul nul nul nul nul +0000100 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul soh nul nul nul nul nul nul nul +0000120 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul stx nul nul nul nul nul nul nul nul nul nul nul +0000140 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0007760 00 00 00 00 00 00 00 00 33 38 54 e3 f3 0e bb 6c + nul nul nul nul nul nul nul nul 3 8 T c s so ; l +0010000 +`}, + } + + for _, tt := range tests { + tt := tt + t.Run(fmt.Sprintf("offset %d, alignment %d", tt.dataOffset, tt.alignment), + func(t *testing.T) { + t.Parallel() + data := nsdax(tt.dataOffset, tt.alignment) + cmd := exec.Command("od", "-t", "x1", "-a") + cmd.Stdin = bytes.NewBuffer(data) + out, err := cmd.Output() + if err != nil { + t.Fatalf("od failed: %v", err) + } + assert.Equal(t, tt.odDump, string(out)) + }) + } +} + +func rmTmpfile(file *os.File) { + file.Close() + os.Remove(file.Name()) +} + +func TestExtents(t *testing.T) { + file, err := ioutil.TempFile("", "extents") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(file) + + // Create a sparse file with one byte at 1MB, 2 bytes at 2MB, 4 bytes at 4MB, etc. + // The file then should have one extent at each of these offsets. 
+ const numExtents = initialExtentSize + offset := int64(0) + for count := 1; count <= numExtents; count++ { + if _, err := file.Seek(offset, os.SEEK_SET); err != nil { + t.Fatalf("seek to %d: %v", offset, err) + } + if _, err := file.Write(make([]byte, count)); err != nil { + t.Fatalf("write %d bytes at %d: %v", count, offset, err) + } + if offset == 0 { + offset = 1024 * 1024 + } else { + offset = offset * 2 + } + } + + verify := func(filename string) { + extents, err := getAllExtents(filename) + if err != nil { + var errno syscall.Errno + if errors.As(err, &errno) && errno == syscall.ENOTSUP { + t.Skipf("getting extents not supported for %q", filename) + } + t.Fatalf("could not get extents: %v", err) + } + assert.Equal(t, numExtents, len(extents), "number of extents") + offset = 0 + for count := 1; count <= numExtents && count <= len(extents); count++ { + extent := extents[count-1] + assert.Equal(t, extent.Logical, uint64(offset), "offset of extent %d", count) + if offset == 0 { + offset = 1024 * 1024 + } else { + offset = offset * 2 + } + } + } + + verify(file.Name()) + + // Now create a sparse copy. This should be almost + // instantaneous, despite the nominally large file. 
+ copy, err := ioutil.TempFile(".", "copy") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(copy) + start := time.Now() + if err := dd(file.Name(), copy.Name(), true /* sparse */, 0); err != nil { + t.Fatalf("failed to copy: %v", err) + } + delta := time.Since(start) + assert.Less(t, delta.Seconds(), 10.0, "time for copying file") + verify(copy.Name()) +} + +func logStderr(t *testing.T, err error) { + if err == nil { + return + } + var exitError *exec.ExitError + if errors.As(err, &exitError) { + t.Logf("command failed, stderr:\n%s", string(exitError.Stderr)) + } +} + +func TestImageFile(t *testing.T) { + tooSmallSize := HeaderSize - 1 + toString := func(size Bytes) string { + return resource.NewQuantity(int64(size), resource.BinarySI).String() + } + + // Try with ext4 and XFS. + run := func(fs FsType) { + t.Run(string(fs), func(t *testing.T) { + // Try with a variety of sizes because the image file is + // sensitive to alignment problems. + tests := []struct { + size string + expectedError string + }{ + {size: toString(tooSmallSize), expectedError: fmt.Sprintf("invalid image file size %d, must be larger than HeaderSize=%d", tooSmallSize, HeaderSize)}, + {size: "512Mi"}, + {size: "511Mi"}, + } + for _, tt := range tests { + tt := tt + t.Run(tt.size, func(t *testing.T) { + quantity := resource.MustParse(tt.size) + testImageFile(t, fs, Bytes(quantity.Value()), tt.expectedError) + }) + } + }) + } + run(Ext4) + run(Xfs) +} + +func testImageFile(t *testing.T, fs FsType, size Bytes, expectedError string) { + if _, err := exec.LookPath("parted"); err != nil { + t.Skipf("parted not found: %v", err) + } + + file, err := ioutil.TempFile("", "image") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(file) + + err = Create(file.Name(), size, fs) + switch { + case expectedError == "" && err != nil: + logStderr(t, err) + t.Fatalf("failed to create image file: %v", err) + case expectedError != "" && err == nil: + 
t.Fatalf("did not fail with %q", expectedError) + case expectedError != "" && err != nil: + assert.Equal(t, err.Error(), expectedError, "wrong error message") + return + } + + fi, err := file.Stat() + if err != nil { + t.Fatalf("failed to stat image file: %v", err) + } + assert.GreaterOrEqual(t, fi.Size(), int64(size), "nominal image size") + + if os.Getenv("TEST_WORK") == "" || os.Getenv("VM_IMAGE") == "" { + t.Log("for testing the image under QEMU, download files for a cluster and set TEST_WORK and VM_IMAGE") + return + } + cmd := exec.Cmd{ + Path: "test/check-imagefile.sh", + Args: []string{"check-imagefile.sh"}, + Env: append(os.Environ(), + "EXISTING_VM_FILE="+file.Name(), + ), + Dir: filepath.Join(os.Getenv("TEST_WORK"), ".."), + } + stdout, err := cmd.StdoutPipe() + if err != nil { + t.Fatalf("failed to set up pipe: %v", err) + } + cmd.Stderr = cmd.Stdout + if err := cmd.Start(); err != nil { + t.Fatalf("start check-imagefile.sh: %v", err) + } + scanner := bufio.NewScanner(stdout) + success := "" + for scanner.Scan() { + line := scanner.Text() + t.Logf("check-imagefile.sh: %s", line) + if strings.HasPrefix(line, "SUCCESS: ") { + success = line + } + } + if err := cmd.Wait(); err != nil { + t.Fatalf("check-imagefile.sh failed: %v", err) + } + fsReadableForm := fs // %T output from stat + if fs == Ext4 { + // We don't bother with checking what is actually mounted, + // this is close enough. + fsReadableForm = "ext2/ext3" + } + assert.Equal(t, + fmt.Sprintf("SUCCESS: fstype=%s partition_size=%d partition_start=%d block_size=%d", + fsReadableForm, + size-HeaderSize, + DaxAlignment, + BlockSize), + success, "filesystem attributes") +} diff --git a/test/check-imagefile.sh b/test/check-imagefile.sh new file mode 100755 index 0000000000..c028807c2a --- /dev/null +++ b/test/check-imagefile.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# +# Produces an image file for QEMU in a tmp directory, then +# checks that QEMU comes up with a /dev/pmem0p1 device. 
+ +TEST_DIRECTORY=${TEST_DIRECTORY:-$(dirname $(readlink -f $0))} +source ${TEST_CONFIG:-${TEST_DIRECTORY}/test-config.sh} + +: ${GOVM_NAME:=pmem-csi-vm} +: ${RESOURCES_DIRECTORY:=_work/resources} +: ${VM_IMAGE:=${RESOURCES_DIRECTORY}/Fedora-Cloud-Base-30-1.2.x86_64.raw} +: ${EXISTING_VM_FILE:=} +: ${EFI:=false} +tmp=$(mktemp -d) +: ${VM_FILE:=$tmp/data/${GOVM_NAME}/nvdimm0} # same file as /data/nvdimm0 above for QEMU inside container +: ${GOVM_YAML:=$tmp/govm.yaml} +: ${SSH_KEY:=${RESOURCES_DIRECTORY}/id_rsa} +: ${SSH_PUBLIC_KEY:=${SSH_KEY}.pub} +: ${SLEEP_ON_FAILURE:=false} + +if [ "${EXISTING_VM_FILE}" ]; then + VM_FILE_SIZE=$(stat -c %s "${EXISTING_VM_FILE}") +else + VM_FILE_SIZE=$((TEST_PMEM_MEM_SIZE * 1024 * 1024)) +fi + +KVM_CPU_OPTS="${KVM_CPU_OPTS:-\ + -m ${TEST_NORMAL_MEM_SIZE}M,slots=${TEST_MEM_SLOTS},maxmem=$((${TEST_NORMAL_MEM_SIZE} + $(((VM_FILE_SIZE + 1024 * 1024 - 1) / 1024 / 1024)) ))M -smp ${TEST_NUM_CPUS} \ + -cpu host \ + -machine pc,accel=kvm,nvdimm=on}" +EXTRA_QEMU_OPTS="${EXTRA_QWEMU_OPTS:-\ + -object memory-backend-file,id=mem1,share=${TEST_PMEM_SHARE},\ +mem-path=/data/nvdimm0,size=${VM_FILE_SIZE} \ + -device nvdimm,id=nvdimm1,memdev=mem1 \ +}" + +SSH_TIMEOUT=120 +SSH_ARGS="-oIdentitiesOnly=yes -oStrictHostKeyChecking=no \ + -oUserKnownHostsFile=/dev/null -oLogLevel=error \ + -i ${SSH_KEY}" + +case ${VM_IMAGE} in + *Fedora*) CLOUD_USER=fedora;; + *clear*) CLOUD_USER=clear;; +esac + +atexit () { + rm -rf "$tmp" + govm rm "${GOVM_NAME}" +} +trap atexit EXIT + +function die() { + echo >&2 "ERROR: $@" + if ${SLEEP_ON_FAILURE}; then + sleep infinity + fi + exit 1 +} + +print_govm_yaml () { + cat <"${GOVM_YAML}" || die "failed to create ${GOVM_YAML}" + govm compose -f "${GOVM_YAML}" || die "govm failed" + IP=$(govm list -f '{{select (filterRegexp . "Name" "^'${GOVM_NAME}'$") "IP"}}') + echo "Waiting for ssh connectivity on vm with ip $IP" + while ! 
ssh $SSH_ARGS ${CLOUD_USER}@${IP} exit 2>/dev/null; do + if [ "$SECONDS" -gt "$SSH_TIMEOUT" ]; then + die "timeout accessing ${ip} through ssh" + fi + done +} + +result= +test_nvdimm () { + ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mkdir -p /mnt || die "cannot created /mnt" + if ! ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mount -odax /dev/pmem0p1 /mnt; then + ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo dmesg + die "cannot mount /dev/pmem0p1 with -odax" + fi + result="fstype=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %T /mnt)" + result+=" partition_size=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0p1/size) * 512))" + result+=" partition_start=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0p1/start) * 512))" + result+=" block_size=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %s /mnt)" +} + +create_image +start_vm +test_nvdimm +echo "SUCCESS: $result"