From b48fa8d22a6c124e0bde4992336dfe95ce52bbdd Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Wed, 12 Feb 2014 13:57:52 +0100 Subject: [PATCH 01/28] fibmap package --- fibmap.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 fibmap.go diff --git a/fibmap.go b/fibmap.go new file mode 100644 index 0000000000..da0717ec34 --- /dev/null +++ b/fibmap.go @@ -0,0 +1,32 @@ +// Linux ioctl FIBMAP/FIEMAP for Go +// Copyright (C) 2014 Andreas Klauer +// License: GPL-2 + +package fibmap + +// #include +// #include +// #include +// #include +// +// void figetbsz() { +// int fd, blocksize; +// ioctl(fd, FIGETBSZ, &blocksize); +// } +// +// void fibmap() { +// int fd, block; +// ioctl(fd, FIBMAP, &block); +// } +// +// void fiemap() { +// int fd, fiemap; +// ioctl(fd, FS_IOC_FIEMAP, &fiemap); +// } +import "C" + +func Fibmap() { +} + +func Fiemap() { +} From a1871923f74ba19094d668ab2b624c7ceb59912f Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Wed, 12 Feb 2014 15:03:15 +0100 Subject: [PATCH 02/28] fibmap as dedicated C file --- fibmap.c | 24 ++++++++++++++++++++++++ fibmap.go | 26 +++++++------------------- fibmap.h | 13 +++++++++++++ 3 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 fibmap.c create mode 100644 fibmap.h diff --git a/fibmap.c b/fibmap.c new file mode 100644 index 0000000000..8159f1ecea --- /dev/null +++ b/fibmap.c @@ -0,0 +1,24 @@ +// Linux ioctl FIBMAP/FIEMAP for Go +// Copyright (C) 2014 Andreas Klauer +// License: GPL-2 + +#include "fibmap.h" +#include + +void fibmap() { + int fd, block; + ioctl(fd, FIBMAP, &block); + printf("this is fibmap in C\n"); +} + +void fiemap() { + int fd, fiemap; + ioctl(fd, FS_IOC_FIEMAP, &fiemap); + printf("this is fiemap in C\n"); +} + +void figetbsz() { + int fd, blocksize; + ioctl(fd, FIGETBSZ, &blocksize); + printf("this is figetbsz in C\n"); +} diff --git a/fibmap.go b/fibmap.go index da0717ec34..86c9d644d7 100644 --- a/fibmap.go +++ b/fibmap.go @@ -4,29 
+4,17 @@ package fibmap -// #include -// #include -// #include -// #include -// -// void figetbsz() { -// int fd, blocksize; -// ioctl(fd, FIGETBSZ, &blocksize); -// } -// -// void fibmap() { -// int fd, block; -// ioctl(fd, FIBMAP, &block); -// } -// -// void fiemap() { -// int fd, fiemap; -// ioctl(fd, FS_IOC_FIEMAP, &fiemap); -// } +// #include "fibmap.h" import "C" func Fibmap() { + C.fibmap() } func Fiemap() { + C.fiemap() +} + +func Figetbsz() { + C.figetbsz() } diff --git a/fibmap.h b/fibmap.h new file mode 100644 index 0000000000..9d6519c1e7 --- /dev/null +++ b/fibmap.h @@ -0,0 +1,13 @@ +#ifndef _FS1UP_FIBMAP_H +#define _FS1UP_FIBMAP_H + +#include +#include +#include +#include + +extern void fibmap(); +extern void fiemap(); +extern void figetbsz(); + +#endif // _FS1UP_FIBMAP_H From f41b08fe2871c2cb1ed80dba4129fc562c590a2a Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Thu, 13 Feb 2014 19:51:07 +0100 Subject: [PATCH 03/28] figetbsz() --- fibmap.c | 8 ++++---- fibmap.go | 5 +++-- fibmap.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fibmap.c b/fibmap.c index 8159f1ecea..218fa1d82b 100644 --- a/fibmap.c +++ b/fibmap.c @@ -17,8 +17,8 @@ void fiemap() { printf("this is fiemap in C\n"); } -void figetbsz() { - int fd, blocksize; - ioctl(fd, FIGETBSZ, &blocksize); - printf("this is figetbsz in C\n"); +int figetbsz(int fd, int* err) { + int blocksize = 0; + (*err) = ioctl(fd, FIGETBSZ, &blocksize); + return blocksize; } diff --git a/fibmap.go b/fibmap.go index 86c9d644d7..ef5ef6b31d 100644 --- a/fibmap.go +++ b/fibmap.go @@ -15,6 +15,7 @@ func Fiemap() { C.fiemap() } -func Figetbsz() { - C.figetbsz() +func Figetbsz(fd uintptr) (int, int) { + var err C.int + return int(C.figetbsz(C.int(fd), &err)), int(err) } diff --git a/fibmap.h b/fibmap.h index 9d6519c1e7..0a9b83c4a0 100644 --- a/fibmap.h +++ b/fibmap.h @@ -8,6 +8,6 @@ extern void fibmap(); extern void fiemap(); -extern void figetbsz(); +extern int figetbsz(int, int*); #endif // 
_FS1UP_FIBMAP_H From 229abe3e378d4adbff4ff6de58e443d23bfe457d Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Thu, 13 Feb 2014 20:18:06 +0100 Subject: [PATCH 04/28] fibmap() --- fibmap.c | 7 +++---- fibmap.go | 5 +++-- fibmap.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fibmap.c b/fibmap.c index 218fa1d82b..6deae90d46 100644 --- a/fibmap.c +++ b/fibmap.c @@ -5,10 +5,9 @@ #include "fibmap.h" #include -void fibmap() { - int fd, block; - ioctl(fd, FIBMAP, &block); - printf("this is fibmap in C\n"); +int fibmap(int fd, int block, int* err) { + (*err) = ioctl(fd, FIBMAP, &block); + return block; } void fiemap() { diff --git a/fibmap.go b/fibmap.go index ef5ef6b31d..9531bae1ca 100644 --- a/fibmap.go +++ b/fibmap.go @@ -7,8 +7,9 @@ package fibmap // #include "fibmap.h" import "C" -func Fibmap() { - C.fibmap() +func Fibmap(fd uintptr, block int) (int, int) { + var err C.int + return int(C.fibmap(C.int(fd), C.int(block), &err)), int(err) } func Fiemap() { diff --git a/fibmap.h b/fibmap.h index 0a9b83c4a0..d0b55fab95 100644 --- a/fibmap.h +++ b/fibmap.h @@ -6,7 +6,7 @@ #include #include -extern void fibmap(); +extern int fibmap(int, int, int*); extern void fiemap(); extern int figetbsz(int, int*); From 480b3eed0bde6df0002b6178a484606638f88540 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Fri, 14 Feb 2014 21:21:25 +0100 Subject: [PATCH 05/28] fiemap() --- fibmap.c | 18 +++++++++++++----- fibmap.go | 14 ++++++++++++-- fibmap.h | 3 ++- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/fibmap.c b/fibmap.c index 6deae90d46..daaae9e396 100644 --- a/fibmap.c +++ b/fibmap.c @@ -3,17 +3,25 @@ // License: GPL-2 #include "fibmap.h" -#include int fibmap(int fd, int block, int* err) { (*err) = ioctl(fd, FIBMAP, &block); return block; } -void fiemap() { - int fd, fiemap; - ioctl(fd, FS_IOC_FIEMAP, &fiemap); - printf("this is fiemap in C\n"); +int fiemap(int fd, int bufsize, char* buf) { + struct fiemap* fie; + int err; + + fie = (struct 
fiemap*) buf; + fie->fm_start = 0; + fie->fm_length = FIEMAP_MAX_OFFSET; + fie->fm_flags = FIEMAP_FLAG_SYNC; + fie->fm_extent_count = (bufsize - sizeof(struct fiemap))/sizeof(struct fiemap_extent); + + err = ioctl(fd, FS_IOC_FIEMAP, fie); + + return err; } int figetbsz(int fd, int* err) { diff --git a/fibmap.go b/fibmap.go index 9531bae1ca..10a80daff8 100644 --- a/fibmap.go +++ b/fibmap.go @@ -5,15 +5,25 @@ package fibmap // #include "fibmap.h" +// #define FIEMAP_SIZE sizeof(struct fiemap) +// #define FIEMAP_EXTENT_SIZE sizeof(struct fiemap_extent) import "C" +import "unsafe" + +const ( + fiemap_SIZE = C.FIEMAP_SIZE + fiemap_EXTENT_SIZE = C.FIEMAP_EXTENT_SIZE +) func Fibmap(fd uintptr, block int) (int, int) { var err C.int return int(C.fibmap(C.int(fd), C.int(block), &err)), int(err) } -func Fiemap() { - C.fiemap() +func Fiemap(fd uintptr) ([]byte, int) { + buffer := make([]byte, fiemap_SIZE+fiemap_EXTENT_SIZE) + err := C.fiemap(C.int(fd), C.int(cap(buffer)), (*C.char)(unsafe.Pointer(&buffer[0]))) + return buffer, int(err) } func Figetbsz(fd uintptr) (int, int) { diff --git a/fibmap.h b/fibmap.h index d0b55fab95..3d3ef178dd 100644 --- a/fibmap.h +++ b/fibmap.h @@ -4,10 +4,11 @@ #include #include #include +#include #include extern int fibmap(int, int, int*); -extern void fiemap(); +extern int fiemap(int, int, char*); extern int figetbsz(int, int*); #endif // _FS1UP_FIBMAP_H From 26f71cec475b19d73a4ca109468ccce3146a529c Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Sat, 15 Feb 2014 21:30:36 +0100 Subject: [PATCH 06/28] Datahole() --- fibmap.go | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fibmap.go b/fibmap.go index 10a80daff8..4984950a8e 100644 --- a/fibmap.go +++ b/fibmap.go @@ -9,10 +9,13 @@ package fibmap // #define FIEMAP_EXTENT_SIZE sizeof(struct fiemap_extent) import "C" import "unsafe" +import "os" const ( fiemap_SIZE = C.FIEMAP_SIZE fiemap_EXTENT_SIZE = C.FIEMAP_EXTENT_SIZE + SEEK_DATA = C.SEEK_DATA + SEEK_HOLE = 
C.SEEK_HOLE ) func Fibmap(fd uintptr, block int) (int, int) { @@ -30,3 +33,30 @@ func Figetbsz(fd uintptr) (int, int) { var err C.int return int(C.figetbsz(C.int(fd), &err)), int(err) } + +// use SEEK_DATA, SEEK_HOLE to find allocated data ranges in a file +func Datahole(f *os.File) []int64 { + old, _ := f.Seek(0, os.SEEK_CUR) + f.Seek(0, os.SEEK_SET) + + var data, hole int64 + var datahole []int64 + + for { + data, _ = f.Seek(hole, SEEK_DATA) + + if data >= hole { + hole, _ = f.Seek(data, SEEK_HOLE) + + if hole > data { + datahole = append(datahole, data, hole-data) + continue + } + } + break + } + + f.Seek(old, os.SEEK_SET) + + return datahole +} From db6609745227bca08a2b29fd8ed9825c95bab58a Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Sun, 16 Feb 2014 21:34:40 +0100 Subject: [PATCH 07/28] delete C file, stick with native (unsafe) Go implementation for now --- fibmap.c | 31 ----------------- fibmap.go | 101 ++++++++++++++++++++++++++++++++++++++++++------------ fibmap.h | 14 -------- 3 files changed, 80 insertions(+), 66 deletions(-) delete mode 100644 fibmap.c delete mode 100644 fibmap.h diff --git a/fibmap.c b/fibmap.c deleted file mode 100644 index daaae9e396..0000000000 --- a/fibmap.c +++ /dev/null @@ -1,31 +0,0 @@ -// Linux ioctl FIBMAP/FIEMAP for Go -// Copyright (C) 2014 Andreas Klauer -// License: GPL-2 - -#include "fibmap.h" - -int fibmap(int fd, int block, int* err) { - (*err) = ioctl(fd, FIBMAP, &block); - return block; -} - -int fiemap(int fd, int bufsize, char* buf) { - struct fiemap* fie; - int err; - - fie = (struct fiemap*) buf; - fie->fm_start = 0; - fie->fm_length = FIEMAP_MAX_OFFSET; - fie->fm_flags = FIEMAP_FLAG_SYNC; - fie->fm_extent_count = (bufsize - sizeof(struct fiemap))/sizeof(struct fiemap_extent); - - err = ioctl(fd, FS_IOC_FIEMAP, fie); - - return err; -} - -int figetbsz(int fd, int* err) { - int blocksize = 0; - (*err) = ioctl(fd, FIGETBSZ, &blocksize); - return blocksize; -} diff --git a/fibmap.go b/fibmap.go index 
4984950a8e..077ee37a25 100644 --- a/fibmap.go +++ b/fibmap.go @@ -1,37 +1,96 @@ -// Linux ioctl FIBMAP/FIEMAP for Go +// This file is part of fs1up. // Copyright (C) 2014 Andreas Klauer // License: GPL-2 +// Package fibmap implements FIBMAP/FIEMAP and related Linux ioctl +// for dealing with low level file allocation. package fibmap -// #include "fibmap.h" -// #define FIEMAP_SIZE sizeof(struct fiemap) -// #define FIEMAP_EXTENT_SIZE sizeof(struct fiemap_extent) -import "C" -import "unsafe" -import "os" +import ( + "os" + "syscall" + "unsafe" +) const ( - fiemap_SIZE = C.FIEMAP_SIZE - fiemap_EXTENT_SIZE = C.FIEMAP_EXTENT_SIZE - SEEK_DATA = C.SEEK_DATA - SEEK_HOLE = C.SEEK_HOLE + FIEMAP_EXTENT_SIZE = 56 // sizeof(struct fiemap_extent) + FIEMAP_SIZE = 32 // sizeof(struct fiemap) + + // Defined in : + SEEK_SET = 0 // seek relative to beginning of file + SEEK_CUR = 1 // seek relative to current file position + SEEK_END = 2 // seek relative to end of file + SEEK_DATA = 3 // seek to the next data + SEEK_HOLE = 4 // seek to the next hole + SEEK_MAX = SEEK_HOLE + FIBMAP = 1 // bmap access + FIGETBSZ = 2 // get the block size used for bmap + FS_IOC_FIEMAP = 3223348747 + + // Defined in : + FIEMAP_MAX_OFFSET = ^uint64(0) + FIEMAP_FLAG_SYNC = 0x0001 // sync file data before map + FIEMAP_FLAG_XATTR = 0x0002 // map extended attribute tree + FIEMAP_FLAG_CACHE = 0x0004 // request caching of the extents + FIEMAP_FLAGS_COMPAT = (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) + FIEMAP_EXTENT_LAST = 0x0001 // Last extent in file. + FIEMAP_EXTENT_UNKNOWN = 0x0002 // Data location unknown. + FIEMAP_EXTENT_DELALLOC = 0x0004 // Location still pending. Sets EXTENT_UNKNOWN. + FIEMAP_EXTENT_ENCODED = 0x0008 // Data can not be read while fs is unmounted + FIEMAP_EXTENT_DATA_ENCRYPTED = 0x0080 // Data is encrypted by fs. Sets EXTENT_NO_BYPASS. + FIEMAP_EXTENT_NOT_ALIGNED = 0x0100 // Extent offsets may not be block aligned. + FIEMAP_EXTENT_DATA_INLINE = 0x0200 // Data mixed with metadata. 
Sets EXTENT_NOT_ALIGNED. + FIEMAP_EXTENT_DATA_TAIL = 0x0400 // Multiple files in block. Sets EXTENT_NOT_ALIGNED. + FIEMAP_EXTENT_UNWRITTEN = 0x0800 // Space allocated, but no data (i.e. zero). + FIEMAP_EXTENT_MERGED = 0x1000 // File does not natively support extents. Result merged for efficiency. + FIEMAP_EXTENT_SHARED = 0x2000 // Space shared with other files. ) -func Fibmap(fd uintptr, block int) (int, int) { - var err C.int - return int(C.fibmap(C.int(fd), C.int(block), &err)), int(err) +// based on struct fiemap from +type fiemap struct { + Start uint64 // logical offset (inclusive) at which to start mapping (in) + Length uint64 // logical length of mapping which userspace wants (in) + Flags uint32 // FIEMAP_FLAG_* flags for request (in/out) + Mapped_extents uint32 // number of extents that were mapped (out) + Extent_count uint32 // size of fm_extents array (in) + Reserved uint32 + // Extents [0]Extent // array of mapped extents (out) +} + +// based on struct fiemap_extent from +type Extent struct { + Logical uint64 // logical offset in bytes for the start of the extent from the beginning of the file + Physical uint64 // physical offset in bytes for the start of the extent from the beginning of the disk + Length uint64 // length in bytes for this extent + Reserved64 [2]uint64 + Flags uint32 // FIEMAP_EXTENT_* flags for this extent + Reserved [3]uint32 } -func Fiemap(fd uintptr) ([]byte, int) { - buffer := make([]byte, fiemap_SIZE+fiemap_EXTENT_SIZE) - err := C.fiemap(C.int(fd), C.int(cap(buffer)), (*C.char)(unsafe.Pointer(&buffer[0]))) - return buffer, int(err) +func Fibmap(fd uintptr, block uint) (uint, syscall.Errno) { + _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, FIBMAP, uintptr(unsafe.Pointer(&block))) + return block, err +} + +func Fiemap(fd uintptr, size uint32) ([]Extent, syscall.Errno) { + extents := make([]Extent, size+1) + ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&extents[1])) - FIEMAP_SIZE) + + t := (*fiemap)(ptr) + t.Start = 0 + t.Length 
= FIEMAP_MAX_OFFSET + t.Flags = FIEMAP_FLAG_SYNC + t.Extent_count = size + + _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, FS_IOC_FIEMAP, uintptr(ptr)) + + return extents[1 : 1+t.Mapped_extents], err } -func Figetbsz(fd uintptr) (int, int) { - var err C.int - return int(C.figetbsz(C.int(fd), &err)), int(err) +func Figetbsz(fd uintptr) (int, syscall.Errno) { + bsz := int(0) + _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, FIGETBSZ, uintptr(unsafe.Pointer(&bsz))) + return bsz, err } // use SEEK_DATA, SEEK_HOLE to find allocated data ranges in a file diff --git a/fibmap.h b/fibmap.h deleted file mode 100644 index 3d3ef178dd..0000000000 --- a/fibmap.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _FS1UP_FIBMAP_H -#define _FS1UP_FIBMAP_H - -#include -#include -#include -#include -#include - -extern int fibmap(int, int, int*); -extern int fiemap(int, int, char*); -extern int figetbsz(int, int*); - -#endif // _FS1UP_FIBMAP_H From d3243148ffd36628fae98cd3031d1e96a28e2c36 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Mon, 17 Feb 2014 21:58:41 +0100 Subject: [PATCH 08/28] FibmapFile type, to keep track of extents and offsets in the future --- fibmap.go | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/fibmap.go b/fibmap.go index 077ee37a25..dadffdb4d6 100644 --- a/fibmap.go +++ b/fibmap.go @@ -67,12 +67,23 @@ type Extent struct { Reserved [3]uint32 } -func Fibmap(fd uintptr, block uint) (uint, syscall.Errno) { - _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, FIBMAP, uintptr(unsafe.Pointer(&block))) +type FibmapFile struct { + *os.File +} + +// return a new FibmapFile +func NewFibmapFile(f *os.File) FibmapFile { + return FibmapFile{f} +} + +// call FIBMAP ioctl +func (f FibmapFile) Fibmap(block uint) (uint, syscall.Errno) { + _, _, err := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), FIBMAP, uintptr(unsafe.Pointer(&block))) return block, err } -func Fiemap(fd uintptr, size uint32) ([]Extent, syscall.Errno) { +// call 
FIEMAP ioctl +func (f FibmapFile) Fiemap(size uint32) ([]Extent, syscall.Errno) { extents := make([]Extent, size+1) ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&extents[1])) - FIEMAP_SIZE) @@ -82,21 +93,21 @@ func Fiemap(fd uintptr, size uint32) ([]Extent, syscall.Errno) { t.Flags = FIEMAP_FLAG_SYNC t.Extent_count = size - _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, FS_IOC_FIEMAP, uintptr(ptr)) + _, _, err := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), FS_IOC_FIEMAP, uintptr(ptr)) return extents[1 : 1+t.Mapped_extents], err } -func Figetbsz(fd uintptr) (int, syscall.Errno) { +// call FIGETBSZ ioctl +func (f FibmapFile) Figetbsz() (int, syscall.Errno) { bsz := int(0) - _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, FIGETBSZ, uintptr(unsafe.Pointer(&bsz))) + _, _, err := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), FIGETBSZ, uintptr(unsafe.Pointer(&bsz))) return bsz, err } // use SEEK_DATA, SEEK_HOLE to find allocated data ranges in a file -func Datahole(f *os.File) []int64 { +func (f FibmapFile) SeekDataHole() []int64 { old, _ := f.Seek(0, os.SEEK_CUR) - f.Seek(0, os.SEEK_SET) var data, hole int64 var datahole []int64 From 316ee4d733e1dc7152adf811a0969f9faa473ff6 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Tue, 18 Feb 2014 20:33:37 +0100 Subject: [PATCH 09/28] FibmapExtents() emulate FIEMAP with FIBMAP --- fibmap.go | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/fibmap.go b/fibmap.go index dadffdb4d6..8dbef96114 100644 --- a/fibmap.go +++ b/fibmap.go @@ -82,6 +82,67 @@ func (f FibmapFile) Fibmap(block uint) (uint, syscall.Errno) { return block, err } +// emulate FIEMAP with FIBMAP +func (f FibmapFile) FibmapExtents() ([]Extent, syscall.Errno) { + result := make([]Extent, 0) + + bsz, err := f.Figetbsz() + if err != 0 { + return nil, err + } + + stat, _ := f.Stat() + size := stat.Size() + if size == 0 { + return result, syscall.Errno(0) + } + + blocks := uint((size-1)/int64(bsz)) + 1 + var 
block, physical, length uint + var e Extent + + for i := uint(0); i < blocks; i++ { + block = i + _, _, err = syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), FIBMAP, uintptr(unsafe.Pointer(&block))) + if block == 0 && physical == 0 { + // hole + continue + } + + if block == 0 { + // hole after extent + e.Length = uint64(length) * uint64(bsz) + result = append(result, e) + length = 0 + physical = 0 + continue + } + + if block == physical { + // segment + length++ + physical++ + continue + } + + // new extent + e = Extent{} + e.Logical = uint64(i) * uint64(bsz) + e.Physical = uint64(block) * uint64(bsz) + + length = 1 + physical = block + 1 + } + + if length > 0 { + // final extent + e.Length = uint64(length) * uint64(bsz) + result = append(result, e) + } + + return result, err +} + // call FIEMAP ioctl func (f FibmapFile) Fiemap(size uint32) ([]Extent, syscall.Errno) { extents := make([]Extent, size+1) From 27496d1b69e90c832a258fd266367462e86cd680 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Wed, 19 Feb 2014 19:12:27 +0100 Subject: [PATCH 10/28] Fallocate()/PunchHole() --- fibmap.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fibmap.go b/fibmap.go index 8dbef96114..7db15b9ec0 100644 --- a/fibmap.go +++ b/fibmap.go @@ -44,6 +44,11 @@ const ( FIEMAP_EXTENT_UNWRITTEN = 0x0800 // Space allocated, but no data (i.e. zero). FIEMAP_EXTENT_MERGED = 0x1000 // File does not natively support extents. Result merged for efficiency. FIEMAP_EXTENT_SHARED = 0x2000 // Space shared with other files. 
+ + // Defined in : + FALLOC_FL_KEEP_SIZE = 0x01 // default is extend size + FALLOC_FL_PUNCH_HOLE = 0x02 // de-allocates range + FALLOC_FL_NO_HIDE_STAE = 0x04 // reserved codepoint ) // based on struct fiemap from @@ -191,3 +196,13 @@ func (f FibmapFile) SeekDataHole() []int64 { return datahole } + +// allocate using fallocate +func (f FibmapFile) Fallocate(offset int64, length int64) error { + return syscall.Fallocate(int(f.Fd()), 0, offset, length) +} + +// punch hole using fallocate +func (f FibmapFile) PunchHole(offset int64, length int64) error { + return syscall.Fallocate(int(f.Fd()), FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, offset, length) +} From 7b72b12230e51bad0464e5fc4c0f2c4bab105a20 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Fri, 21 Feb 2014 21:11:35 +0100 Subject: [PATCH 11/28] MixedCaps for local constants --- fibmap.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fibmap.go b/fibmap.go index 7db15b9ec0..99d0fc74b5 100644 --- a/fibmap.go +++ b/fibmap.go @@ -13,8 +13,8 @@ import ( ) const ( - FIEMAP_EXTENT_SIZE = 56 // sizeof(struct fiemap_extent) - FIEMAP_SIZE = 32 // sizeof(struct fiemap) + FiemapExtentSize = 56 // sizeof(struct fiemap_extent) + FiemapSize = 32 // sizeof(struct fiemap) // Defined in : SEEK_SET = 0 // seek relative to beginning of file @@ -151,7 +151,7 @@ func (f FibmapFile) FibmapExtents() ([]Extent, syscall.Errno) { // call FIEMAP ioctl func (f FibmapFile) Fiemap(size uint32) ([]Extent, syscall.Errno) { extents := make([]Extent, size+1) - ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&extents[1])) - FIEMAP_SIZE) + ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&extents[1])) - FiemapSize) t := (*fiemap)(ptr) t.Start = 0 From 28cd1bf5146ab50aae02aba4e6e6282f51435fc6 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Fri, 25 Apr 2014 16:23:08 +0200 Subject: [PATCH 12/28] initialize repository --- .gitignore | 23 +++++++++++++++++++++++ LICENSE | 21 +++++++++++++++++++++ README.md | 4 ++++ 3 files 
changed, 48 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..836562412f --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..e8c6031889 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Andreas Klauer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000..524e069c4c --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +fibmap +====== + +Golang Linux ioctl FIBMAP, FIEMAP, SEEK_DATA/HOLE, PUNCH_HOLE, ... From 77fdb38ec66c3908e00b9f9c38cff25df636f919 Mon Sep 17 00:00:00 2001 From: Andreas Klauer Date: Fri, 25 Apr 2014 16:24:46 +0200 Subject: [PATCH 13/28] MIT license --- LICENSE | 4 ++-- fibmap.go | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index e8c6031889..305a2b29cb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2014 Andreas Klauer +Copyright (c) 2014 Andreas Klauer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/fibmap.go b/fibmap.go index 99d0fc74b5..ef1aa889ea 100644 --- a/fibmap.go +++ b/fibmap.go @@ -1,6 +1,5 @@ -// This file is part of fs1up. // Copyright (C) 2014 Andreas Klauer -// License: GPL-2 +// License: MIT // Package fibmap implements FIBMAP/FIEMAP and related Linux ioctl // for dealing with low level file allocation. 
From dccaeceaf3b5c9985e1f2040f5e1ed063e269a66 Mon Sep 17 00:00:00 2001 From: chenzhongtao Date: Thu, 11 Aug 2016 10:18:57 +0800 Subject: [PATCH 14/28] Update fibmap.go not continuous segment is different extent --- fibmap.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fibmap.go b/fibmap.go index ef1aa889ea..54614f8381 100644 --- a/fibmap.go +++ b/fibmap.go @@ -129,6 +129,12 @@ func (f FibmapFile) FibmapExtents() ([]Extent, syscall.Errno) { continue } + if block != physical && physical != 0 { + //not continuous segment + e.Length = uint64(length) * uint64(bsz) + result = append(result, e) + } + // new extent e = Extent{} e.Logical = uint64(i) * uint64(bsz) From be6fe65f5eeca1980862cc72b4ebc3cfb6fab233 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Tue, 17 Dec 2019 13:16:47 +0100 Subject: [PATCH 15/28] third-party: support forking of upstream components We cannot always vendor some upstream component. If we can't, then we can fork it by copying the files into "third-party", either with plain "cp" or "git subtree", and the add our own changes on top of it. --- .dockerignore | 1 + hack/copy-modules-license.sh | 19 ++++++++++++++++--- third-party/README.md | 3 +++ 3 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 third-party/README.md diff --git a/.dockerignore b/.dockerignore index 18639bcdbf..7ec04d855f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,6 +7,7 @@ ! pkg/** ! test/test-config.d/** ! test/test-config.sh +! third-party/** ! hack/** ! vendor/** ! go.mod diff --git a/hack/copy-modules-license.sh b/hack/copy-modules-license.sh index 51667a361d..e90a01d48c 100755 --- a/hack/copy-modules-license.sh +++ b/hack/copy-modules-license.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright 2019 Intel Corporation. 
# @@ -31,9 +31,22 @@ pushd vendor > /dev/null for lic in $LICENSE_FILES; do # Copy the license if its repository path is found in package .Deps - if [ $(echo $PACKAGE_DEPS | grep -c `dirname $lic`) -gt 0 ]; then - cp -t "$licensedir" --parent $lic + if echo "$PACKAGE_DEPS" | grep -q "^$(dirname $lic)"; then + cp -t "$licensedir" --parents $lic fi done popd > /dev/null + +# Same for forked third-party code. +LICENSE_FILES=$(find third-party | grep -e LICENSE -e NOTICE | cut -d / -f 2-) + +pushd third-party > /dev/null + +for lic in $LICENSE_FILES; do + if echo "$PACKAGE_DEPS" | grep -q "^github.com/intel/pmem-csi/third-party/$(dirname $lic)"; then + cp -t "$licensedir" --parents $lic + fi +done + +popd > /dev/null diff --git a/third-party/README.md b/third-party/README.md new file mode 100644 index 0000000000..c28a3cbf02 --- /dev/null +++ b/third-party/README.md @@ -0,0 +1,3 @@ +This directory contains forks of upstream code under various licenses. +Code might have to be forked because additional changes had to be made +or upstream is inactive and we need to maintain the code ourselves. From 997a767e9ddf03414c208415377c6864a6570678 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 16 Dec 2019 13:29:21 +0100 Subject: [PATCH 16/28] imagefile: embed namespace and filesystem inside a file The resulting file can be used as backing store for a QEMU nvdimm device. This is based on the approach that is used for the Kata Container rootfs (https://github.com/kata-containers/osbuilder/blob/dbbf16082da3de37d89af0783e023269210b2c91/image-builder/image_builder.sh) and reuses some of the same code, but also differs from that in some regards: - The start of the partition is aligned a multiple of the 2MiB huge page size (https://github.com/kata-containers/runtime/issues/2262#issuecomment-566956462). - The size of the QEMU object is the same as the nominal size of the file. 
In Kata Containers the size is a fixed 128MiB (https://github.com/kata-containers/osbuilder/issues/391#issuecomment-566651878). --- pkg/imagefile/imagefile.go | 507 ++++++++++++++++++++++++++++++++ pkg/imagefile/imagefile_test.go | 292 ++++++++++++++++++ test/check-imagefile.sh | 119 ++++++++ 3 files changed, 918 insertions(+) create mode 100644 pkg/imagefile/imagefile.go create mode 100644 pkg/imagefile/imagefile_test.go create mode 100755 test/check-imagefile.sh diff --git a/pkg/imagefile/imagefile.go b/pkg/imagefile/imagefile.go new file mode 100644 index 0000000000..72283592d4 --- /dev/null +++ b/pkg/imagefile/imagefile.go @@ -0,0 +1,507 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +This file contains code originally published by Intel under GPL: +- https://github.com/torvalds/linux/blob/72c0870e3a05d9cd5466d08c3d2a3069ed0a2f9f/drivers/nvdimm/claim.c#L228-L249 + from https://github.com/torvalds/linux/commit/e1455744b27c9e6115c3508a7b2902157c2c4347 +- https://github.com/torvalds/linux/blob/72c0870e3a05d9cd5466d08c3d2a3069ed0a2f9f/drivers/nvdimm/core.c#L179-L193 + from https://github.com/torvalds/linux/commit/eaf961536e1622ad21247ac8d44acd48ba65566e +- https://github.com/torvalds/linux/blob/a3619190d62ed9d66416891be2416f6bea2b3ca4/drivers/nvdimm/pfn.h#L12-L34 + from multiple commits by djb, see https://github.com/torvalds/linux/blame/a3619190d62ed9d66416891be2416f6bea2b3ca4/drivers/nvdimm/pfn.h#L12-L34 +- https://github.com/kata-containers/osbuilder/blob/dbbf16082da3de37d89af0783e023269210b2c91/image-builder/nsdax.gpl.c#L1-L171 + from https://github.com/kata-containers/osbuilder/commit/726f798ff795ef4a8300201cab8d83e83c1496a5#diff-1d1124b18f3d6153eb2a9bba67c6314d + +That code gets re-published here under Apache-2.0. 
+ +Furthermore, this file is based on the following code published by Intel under Apache-2.0: +- https://github.com/kata-containers/osbuilder/blob/d1751a35e1bd1613e66df87221faed195225718e/image-builder/image_builder.sh +*/ + +/* +Package imagefile contains code to create a file with the following content: + + .-----------.----------.---------------.-----------. + | 0 - 512 B | 4 - 8 Kb | 2M - 2M+512B | 4M | + |-----------+----------+---------------+-----------+ + | MBR #1 | DAX | MBR #2 | FS | + '-----------'----------'---------------'-----------+ + | | ^ | ^ + | '-data-' '--------' + | | + '--------rootfs-partition---------' + + ^ ^ ^ + daxHeaderOffset | | + daxHeaderSize | + HeaderSize + + +MBR: Master boot record. +DAX: Metadata required by the NVDIMM driver to enable DAX in the guest [1][2] (struct nd_pfn_sb). +FS: partition that contains a filesystem. + +Kernels and hypervisors that support DAX/NVDIMM read the MBR #2, otherwise MBR #1 is read. +The /dev/pmem0 device starts at the MBR which is used. + +When such a file is created on a dax-capable filesystem, then it can +be used as backing store for a [QEMU nvdimm +device](https://github.com/qemu/qemu/blob/master/docs/nvdimm.txt) such +that the guest kernel provides a /dev/pmem0p1 (the FS partition above) +which can be mounted with -odax. For full dax semantic, the QEMU +device configuration must use the 'pmem' and 'share' flags. +*/ +package imagefile + +// Here the original C code gets included verbatim and compiled with cgo. 
+ +// #include +// #include +// #include +// +// #define __KERNEL__ +// #include +// #include +// +// /* +// Next types, definitions and functions were copied from kernel 4.19.24 source +// code, specifically from nvdimm driver +// */ +// +// #define PFN_SIG_LEN 16 +// #define PFN_SIG "NVDIMM_PFN_INFO\0" +// #define SZ_4K 0x00001000 +// +// typedef __u16 u16; +// typedef __u8 u8; +// typedef __u64 u64; +// typedef __u32 u32; +// typedef int bool; +// +// enum nd_pfn_mode { +// PFN_MODE_NONE, +// PFN_MODE_RAM, +// PFN_MODE_PMEM, +// }; +// +// struct nd_pfn_sb { +// u8 signature[PFN_SIG_LEN]; +// u8 uuid[16]; +// u8 parent_uuid[16]; +// __le32 flags; +// __le16 version_major; +// __le16 version_minor; +// __le64 dataoff; /* relative to namespace_base + start_pad */ +// __le64 npfns; +// __le32 mode; +// /* minor-version-1 additions for section alignment */ +// __le32 start_pad; +// __le32 end_trunc; +// /* minor-version-2 record the base alignment of the mapping */ +// __le32 align; +// /* minor-version-3 guarantee the padding and flags are zero */ +// u8 padding[4000]; +// __le64 checksum; +// }; +// +// struct nd_gen_sb { +// char reserved[SZ_4K - 8]; +// __le64 checksum; +// }; +// +// u64 nd_fletcher64(void *addr, size_t len, bool le) +// { +// u32 *buf = addr; +// u32 lo32 = 0; +// u64 hi32 = 0; +// int i; +// +// for (i = 0; i < len / sizeof(u32); i++) { +// lo32 += le ? le32toh((__le32) buf[i]) : buf[i]; +// hi32 += lo32; +// } +// +// return hi32 << 32 | lo32; +// } +// +// /* +// * nd_sb_checksum: compute checksum for a generic info block +// * +// * Returns a fletcher64 checksum of everything in the given info block +// * except the last field (since that's where the checksum lives). 
+// */ +// u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +// { +// u64 sum; +// __le64 sum_save; +// +// sum_save = nd_gen_sb->checksum; +// nd_gen_sb->checksum = 0; +// sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); +// nd_gen_sb->checksum = sum_save; +// return sum; +// } +// +// void nsdax(void *p, unsigned int data_offset, unsigned int alignment) +// { +// struct nd_pfn_sb *sb = p; +// memset(sb, 0, sizeof(*sb)); +// +// strcpy((char*)sb->signature, PFN_SIG); +// sb->mode = PFN_MODE_RAM; +// sb->align = htole32(alignment); +// sb->dataoff = htole64((unsigned long)data_offset); +// sb->version_minor = 2; +// +// // checksum must be calculated at the end +// sb->checksum = nd_sb_checksum((struct nd_gen_sb*)sb); +// } +import "C" + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "syscall" + + "github.com/intel/pmem-csi/third-party/go-fibmap" +) + +type FsType string + +const ( + Ext4 FsType = "ext4" + Xfs FsType = "xfs" +) + +type Bytes int64 + +const ( + MiB = Bytes(1024 * 1024) + KiB = Bytes(1024) + + // Alignment of partitions relative to the start of the block device, i.e. the MBR. + // For DAX huge pages this has to be 2MB, see https://nvdimm.wiki.kernel.org/2mib_fs_dax + DaxAlignment = 2 * MiB + + // DAX header offset in the file. This is where the Linux kernel expects it + // (https://github.com/torvalds/linux/blob/2187f215ebaac73ddbd814696d7c7fa34f0c3de0/drivers/nvdimm/pfn_devs.c#L438-L596). + daxHeaderOffset = 4 * KiB + + // Start of MBR#2. + // Chosen so that we have enough space before it for MBR #1 and the DAX metadata, + // doesn't really have to be aligned. + daxHeaderSize = DaxAlignment + + // Total size of the MBR + DAX data before the actual partition. + // This has to be aligned relative to MBR#2, which is at daxHeaderSize, + // for huge pages to work inside a VM. + HeaderSize = daxHeaderSize + DaxAlignment + + // Block size used for the filesystem. ext4 only supports dax with 4KiB blocks. 
+	BlockSize = 4 * KiB
+)
+
+// Create writes a complete image file of a certain total size.
+// The size must be a multiple of DaxAlignment. The resulting
+// partition then is size - HeaderSize large.
+//
+// Depending on the filesystem, additional constraints apply
+// for the size of that partition. A multiple of BlockSize
+// should be safe.
+//
+// The resulting file will have "size" bytes in use, but the nominal
+// size may be larger to accommodate alignment requirements when
+// creating an nvdimm device for QEMU. The extra bytes at the end
+// are not allocated and never should be read or written because
+// the partition ends before them.
+//
+// The result will be sparse, i.e. empty parts are not actually
+// written yet, but they will be allocated, so there is no risk
+// later on that attempting to write fails due to lack of space.
+func Create(filename string, size Bytes, fs FsType) error {
+	if size <= HeaderSize {
+		return fmt.Errorf("invalid image file size %d, must be larger than HeaderSize=%d", size, HeaderSize)
+	}
+	fsSize := size - HeaderSize
+
+	dir, err := os.Open(filepath.Dir(filename))
+	if err != nil {
+		return err
+	}
+	defer dir.Close()
+
+	file, err := os.Create(filename)
+	if err != nil {
+		return err
+	}
+
+	// Delete file on failure.
+	success := false
+	defer func() {
+		if !success {
+			os.Remove(filename)
+		}
+	}()
+	defer file.Close()
+
+	// Enlarge the file and ensure that we really have enough space for it.
+	if err := file.Truncate(int64(size)); err != nil {
+		return fmt.Errorf("resize %q to %d: %w", filename, size, err)
+	}
+	if err := syscall.Fallocate(int(file.Fd()), 0, 0, int64(size)); err != nil {
+		return fmt.Errorf("fallocate %q size %d: %w", filename, size, err)
+	}
+
+	// We write MBRs and rootfs into temporary files, then copy into the
+	// final image file at the end.
+ tmp, err := ioutil.TempDir("", "pmem-csi-imagefile") + if err != nil { + return fmt.Errorf("temp dir: %w", err) + } + defer os.RemoveAll(tmp) + mbr1 := filepath.Join(tmp, "mbr1") + mbr2 := filepath.Join(tmp, "mbr2") + fsimage := filepath.Join(tmp, "fsimage") + + // This is for the full image file. + if err := writeMBR(mbr1, fs, HeaderSize, size); err != nil { + return err + } + + // This is for the image file minus the dax header. + if err := writeMBR(mbr2, fs, DaxAlignment, size-daxHeaderSize); err != nil { + return err + } + + // Create a file of the desired size, then let mkfs write into it. + fsFile, err := os.Create(fsimage) + if err != nil { + return err + } + defer fsFile.Close() + if err := fsFile.Truncate(int64(fsSize)); err != nil { + return err + } + args := []string{fmt.Sprintf("mkfs.%s", fs)} + // Required for dax semantic. + switch fs { + case Ext4: + args = append(args, "-b", fmt.Sprintf("%d", BlockSize)) + case Xfs: + args = append(args, "-b", fmt.Sprintf("size=%d", BlockSize)) + } + args = append(args, fsimage) + cmd := exec.Command(args[0], args[1:]...) + if _, err := cmd.Output(); err != nil { + return fmt.Errorf("mkfs.%s for fs of size %d: %w", fs, fsSize, err) + } + + // Now copy to the actual file. + if err := dd(mbr1, filename, true, 0); err != nil { + return err + } + if _, err := file.Seek(int64(daxHeaderOffset), os.SEEK_SET); err != nil { + return err + } + if _, err := file.Write(nsdax(uint(daxHeaderSize), uint(DaxAlignment))); err != nil { + return err + } + if err := dd(mbr2, filename, true, daxHeaderSize); err != nil { + return err + } + if err := dd(fsimage, filename, true, HeaderSize); err != nil { + return err + } + + // Some (but not all) kernels seem to expect the entire file to align at + // a 2GiB boundary. Therefore we round up and add some un-allocated padding + // at the end. 
+ newSize := (size + DaxAlignment - 1) / DaxAlignment * DaxAlignment + if err := file.Truncate(int64(newSize)); err != nil { + return fmt.Errorf("resize %q to %d: %w", filename, newSize, err) + } + + // Safely close the file: + // - sync content + // - close it + // - sync directory + if err := file.Sync(); err != nil { + return fmt.Errorf("syncing %q: %w", filename, err) + } + if err := file.Close(); err != nil { + return fmt.Errorf("closing %q: %w", filename, err) + } + if err := dir.Sync(); err != nil { + return fmt.Errorf("syncing parent directory of %q: %w", filename, err) + } + + success = true + return nil +} + +// nsdax prepares 4KiB of DAX metadata. +func nsdax(dataOffset uint, alignment uint) []byte { + p := C.malloc(C.sizeof_struct_nd_pfn_sb) + defer C.free(p) + + // Fill allocated memory... + C.nsdax(p, C.uint(dataOffset), C.uint(alignment)) + + // ... and return a copy in a normal slice. + return C.GoBytes(p, C.sizeof_struct_nd_pfn_sb) +} + +// writeMBR writes a master boot record at the start of the given image file. +func writeMBR(to string, fs FsType, partitionStart Bytes, partitionEnd Bytes) error { + // Doesn't have to be a block device, but must exist and be large enough. + file, err := os.Create(to) + if err != nil { + return err + } + defer file.Close() + if err := file.Truncate(int64(partitionEnd)); err != nil { + return fmt.Errorf("resize %q to %d: %w", to, partitionEnd, err) + } + + // From https://github.com/kata-containers/osbuilder/blob/d1751a35e1bd1613e66df87221faed195225718e/image-builder/image_builder.sh#L346-L348 + // with some changes: + // - no alignment + // - subtract one from the end because it looks like start and end of the partition + // are both inclusive; at least for end == size of file we get an error + // (Error: The location .... is outside of the device ...). 
+ cmd := exec.Command("parted", "--script", "--align", "none", to, "--", + "mklabel", "msdos", + "mkpart", "primary", string(fs), + fmt.Sprintf("%dB", partitionStart), + fmt.Sprintf("%dB", partitionEnd-1), + ) + if _, err := cmd.Output(); err != nil { + return fmt.Errorf("write MBR with parted to %q: %w", to, err) + } + return nil +} + +// dd copies one complete file into another at a certain target offset. +// Whether it actually writes the data even when it's just zeros is +// configurable. +func dd(from, to string, sparse bool, seek Bytes) error { + fi, err := os.Stat(from) + if err != nil { + return err + } + + var extents []fibmap.Extent + if sparse { + e, err := getAllExtents(from) + if err != nil { + return err + } + extents = e + } else { + // Copy the entire file. + extents = append(extents, fibmap.Extent{Length: uint64(fi.Size())}) + } + + in, err := os.Open(from) + if err != nil { + return err + } + defer in.Close() + out, err := os.OpenFile(to, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + for _, extent := range extents { + // The extent might stretch beyond the end of the file. + length := int64(extent.Length) + remaining := fi.Size() - int64(extent.Logical) + if length > remaining { + length = remaining + } + if err := copyRange(in, out, int64(extent.Logical), int64(seek)+int64(extent.Logical), length); err != nil { + return err + } + } + + return nil +} + +const ioChunkSize = 256 * 1024 * 1024 + +func copyRange(from, to *os.File, skip, seek, size int64) error { + buffer := make([]byte, ioChunkSize) + + if _, err := from.Seek(skip, os.SEEK_SET); err != nil { + return err + } + if _, err := to.Seek(seek, os.SEEK_SET); err != nil { + return err + } + remaining := size + for remaining > 0 { + current := remaining + if current > int64(len(buffer)) { + current = int64(len(buffer)) + } + + // We shouldn't run into io.EOF here because we don't + // attempt to read past the end of the file, so any error + // is a reason to fail. 
+ read, err := from.Read(buffer[0:current]) + if err != nil { + return err + } + if int64(read) < current { + return fmt.Errorf("%q: unexpected short read, got %d instead of %d bytes", from.Name(), read, current) + } + // In contrast to reads, a short write is guaranteed to return an error. + if _, err := to.Write(buffer[0:current]); err != nil { + return err + } + remaining = remaining - current + } + return nil +} + +const initialExtentSize = 16 + +func getAllExtents(from string) ([]fibmap.Extent, error) { + file, err := os.Open(from) + if err != nil { + return nil, err + } + fibmapFile := fibmap.NewFibmapFile(file) + + // Try with FIBMAP first. + for size := uint32(initialExtentSize); ; size = size * 2 { + extents, err := fibmapFile.Fiemap(size) + // Got all extents? + if err == 0 && + (len(extents) == 0 || (extents[len(extents)-1].Flags&fibmap.FIEMAP_EXTENT_LAST) != 0) { + return extents, nil + } + if err == syscall.ENOTSUP { + break + } + if err != 0 { + return nil, &os.PathError{Op: "fibmap", Path: from, + Err: &os.SyscallError{Syscall: "ioctl", Err: err}} + } + } + + // Not supported by tmpfs, which supports SEEK_DATA and SEEK_HOLE. Fall back to that. 
+ // TODO: error reporting in SeekDataHole() + offsets := fibmapFile.SeekDataHole() + var extents []fibmap.Extent + for i := 0; i+1 < len(offsets); i += 2 { + extents = append(extents, fibmap.Extent{Logical: uint64(offsets[i]), Length: uint64(offsets[i+1])}) + } + return extents, nil +} diff --git a/pkg/imagefile/imagefile_test.go b/pkg/imagefile/imagefile_test.go new file mode 100644 index 0000000000..f613bfca4d --- /dev/null +++ b/pkg/imagefile/imagefile_test.go @@ -0,0 +1,292 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +*/ + +package imagefile + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" + "testing" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/stretchr/testify/assert" +) + +func TestNsdax(t *testing.T) { + type testcase struct { + dataOffset uint + alignment uint + odDump string + } + + // Expected output comes from: + // - curl -O https://github.com/kata-containers/osbuilder/raw/726f798ff795ef4a8300201cab8d83e83c1496a5/image-builder/nsdax.gpl.c + // - gcc -o nsdax.gpl nsdax.gpl.c + // - truncate -s 0 /tmp/image + // - ./fsdax.gpl /tmp/image + // - tail -c +$((0x00001001)) /tmp/image | od -t x1 -a + // + // 0x00001001 = SZ_4K + 1 + tests := []struct { + dataOffset uint + alignment uint + odDump string + }{ + {1024, 2048, + `0000000 4e 56 44 49 4d 4d 5f 50 46 4e 5f 49 4e 46 4f 00 + N V D I M M _ P F N _ I N F O nul +0000020 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0000060 00 00 00 00 00 00 02 00 00 04 00 00 00 00 00 00 + nul nul nul nul nul nul stx nul nul eot nul nul nul nul nul nul +0000100 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul soh nul nul nul nul nul nul nul +0000120 00 00 00 00 00 08 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul bs nul nul nul nul nul nul nul nul nul nul +0000140 00 
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0007760 00 00 00 00 00 00 00 00 30 44 54 e3 2b 23 ea 6c + nul nul nul nul nul nul nul nul 0 D T c + # j l +0010000 +`}, + {1, 2, `0000000 4e 56 44 49 4d 4d 5f 50 46 4e 5f 49 4e 46 4f 00 + N V D I M M _ P F N _ I N F O nul +0000020 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0000060 00 00 00 00 00 00 02 00 01 00 00 00 00 00 00 00 + nul nul nul nul nul nul stx nul soh nul nul nul nul nul nul nul +0000100 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul soh nul nul nul nul nul nul nul +0000120 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul stx nul nul nul nul nul nul nul nul nul nul nul +0000140 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul nul +* +0007760 00 00 00 00 00 00 00 00 33 38 54 e3 f3 0e bb 6c + nul nul nul nul nul nul nul nul 3 8 T c s so ; l +0010000 +`}, + } + + for _, tt := range tests { + tt := tt + t.Run(fmt.Sprintf("offset %d, alignment %d", tt.dataOffset, tt.alignment), + func(t *testing.T) { + t.Parallel() + data := nsdax(tt.dataOffset, tt.alignment) + cmd := exec.Command("od", "-t", "x1", "-a") + cmd.Stdin = bytes.NewBuffer(data) + out, err := cmd.Output() + if err != nil { + t.Fatalf("od failed: %v", err) + } + assert.Equal(t, tt.odDump, string(out)) + }) + } +} + +func rmTmpfile(file *os.File) { + file.Close() + os.Remove(file.Name()) +} + +func TestExtents(t *testing.T) { + file, err := ioutil.TempFile("", "extents") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(file) + + // Create a sparse file with one byte at 1MB, 2 bytes at 2MB, 4 bytes at 4MB, etc. + // The file then should have one extent at each of these offsets. 
+ const numExtents = initialExtentSize + offset := int64(0) + for count := 1; count <= numExtents; count++ { + if _, err := file.Seek(offset, os.SEEK_SET); err != nil { + t.Fatalf("seek to %d: %v", offset, err) + } + if _, err := file.Write(make([]byte, count)); err != nil { + t.Fatalf("write %d bytes at %d: %v", count, offset, err) + } + if offset == 0 { + offset = 1024 * 1024 + } else { + offset = offset * 2 + } + } + + verify := func(filename string) { + extents, err := getAllExtents(filename) + if err != nil { + var errno syscall.Errno + if errors.As(err, &errno) && errno == syscall.ENOTSUP { + t.Skipf("getting extents not supported for %q", filename) + } + t.Fatalf("could not get extents: %v", err) + } + assert.Equal(t, numExtents, len(extents), "number of extents") + offset = 0 + for count := 1; count <= numExtents && count <= len(extents); count++ { + extent := extents[count-1] + assert.Equal(t, extent.Logical, uint64(offset), "offset of extent %d", count) + if offset == 0 { + offset = 1024 * 1024 + } else { + offset = offset * 2 + } + } + } + + verify(file.Name()) + + // Now create a sparse copy. This should be almost + // instantaneous, despite the nominally large file. 
+ copy, err := ioutil.TempFile(".", "copy") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(copy) + start := time.Now() + if err := dd(file.Name(), copy.Name(), true /* sparse */, 0); err != nil { + t.Fatalf("failed to copy: %v", err) + } + delta := time.Since(start) + assert.Less(t, delta.Seconds(), 10.0, "time for copying file") + verify(copy.Name()) +} + +func logStderr(t *testing.T, err error) { + if err == nil { + return + } + var exitError *exec.ExitError + if errors.As(err, &exitError) { + t.Logf("command failed, stderr:\n%s", string(exitError.Stderr)) + } +} + +func TestImageFile(t *testing.T) { + tooSmallSize := HeaderSize - 1 + toString := func(size Bytes) string { + return resource.NewQuantity(int64(size), resource.BinarySI).String() + } + + // Try with ext4 and XFS. + run := func(fs FsType) { + t.Run(string(fs), func(t *testing.T) { + // Try with a variety of sizes because the image file is + // sensitive to alignment problems. + tests := []struct { + size string + expectedError string + }{ + {size: toString(tooSmallSize), expectedError: fmt.Sprintf("invalid image file size %d, must be larger than HeaderSize=%d", tooSmallSize, HeaderSize)}, + {size: "512Mi"}, + {size: "511Mi"}, + } + for _, tt := range tests { + tt := tt + t.Run(tt.size, func(t *testing.T) { + quantity := resource.MustParse(tt.size) + testImageFile(t, fs, Bytes(quantity.Value()), tt.expectedError) + }) + } + }) + } + run(Ext4) + run(Xfs) +} + +func testImageFile(t *testing.T, fs FsType, size Bytes, expectedError string) { + if _, err := exec.LookPath("parted"); err != nil { + t.Skipf("parted not found: %v", err) + } + + file, err := ioutil.TempFile("", "image") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(file) + + err = Create(file.Name(), size, fs) + switch { + case expectedError == "" && err != nil: + logStderr(t, err) + t.Fatalf("failed to create image file: %v", err) + case expectedError != "" && err == nil: + 
t.Fatalf("did not fail with %q", expectedError) + case expectedError != "" && err != nil: + assert.Equal(t, err.Error(), expectedError, "wrong error message") + return + } + + fi, err := file.Stat() + if err != nil { + t.Fatalf("failed to stat image file: %v", err) + } + assert.GreaterOrEqual(t, fi.Size(), int64(size), "nominal image size") + + if os.Getenv("TEST_WORK") == "" || os.Getenv("VM_IMAGE") == "" { + t.Log("for testing the image under QEMU, download files for a cluster and set TEST_WORK and VM_IMAGE") + return + } + cmd := exec.Cmd{ + Path: "test/check-imagefile.sh", + Args: []string{"check-imagefile.sh"}, + Env: append(os.Environ(), + "EXISTING_VM_FILE="+file.Name(), + ), + Dir: filepath.Join(os.Getenv("TEST_WORK"), ".."), + } + stdout, err := cmd.StdoutPipe() + if err != nil { + t.Fatalf("failed to set up pipe: %v", err) + } + cmd.Stderr = cmd.Stdout + if err := cmd.Start(); err != nil { + t.Fatalf("start check-imagefile.sh: %v", err) + } + scanner := bufio.NewScanner(stdout) + success := "" + for scanner.Scan() { + line := scanner.Text() + t.Logf("check-imagefile.sh: %s", line) + if strings.HasPrefix(line, "SUCCESS: ") { + success = line + } + } + if err := cmd.Wait(); err != nil { + t.Fatalf("check-imagefile.sh failed: %v", err) + } + fsReadableForm := fs // %T output from stat + if fs == Ext4 { + // We don't bother with checking what is actually mounted, + // this is close enough. + fsReadableForm = "ext2/ext3" + } + assert.Equal(t, + fmt.Sprintf("SUCCESS: fstype=%s partition_size=%d partition_start=%d block_size=%d", + fsReadableForm, + size-HeaderSize, + DaxAlignment, + BlockSize), + success, "filesystem attributes") +} diff --git a/test/check-imagefile.sh b/test/check-imagefile.sh new file mode 100755 index 0000000000..c028807c2a --- /dev/null +++ b/test/check-imagefile.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# +# Produces an image file for QEMU in a tmp directory, then +# checks that QEMU comes up with a /dev/pmem0p1 device. 
+ +TEST_DIRECTORY=${TEST_DIRECTORY:-$(dirname $(readlink -f $0))} +source ${TEST_CONFIG:-${TEST_DIRECTORY}/test-config.sh} + +: ${GOVM_NAME:=pmem-csi-vm} +: ${RESOURCES_DIRECTORY:=_work/resources} +: ${VM_IMAGE:=${RESOURCES_DIRECTORY}/Fedora-Cloud-Base-30-1.2.x86_64.raw} +: ${EXISTING_VM_FILE:=} +: ${EFI:=false} +tmp=$(mktemp -d) +: ${VM_FILE:=$tmp/data/${GOVM_NAME}/nvdimm0} # same file as /data/nvdimm0 above for QEMU inside container +: ${GOVM_YAML:=$tmp/govm.yaml} +: ${SSH_KEY:=${RESOURCES_DIRECTORY}/id_rsa} +: ${SSH_PUBLIC_KEY:=${SSH_KEY}.pub} +: ${SLEEP_ON_FAILURE:=false} + +if [ "${EXISTING_VM_FILE}" ]; then + VM_FILE_SIZE=$(stat -c %s "${EXISTING_VM_FILE}") +else + VM_FILE_SIZE=$((TEST_PMEM_MEM_SIZE * 1024 * 1024)) +fi + +KVM_CPU_OPTS="${KVM_CPU_OPTS:-\ + -m ${TEST_NORMAL_MEM_SIZE}M,slots=${TEST_MEM_SLOTS},maxmem=$((${TEST_NORMAL_MEM_SIZE} + $(((VM_FILE_SIZE + 1024 * 1024 - 1) / 1024 / 1024)) ))M -smp ${TEST_NUM_CPUS} \ + -cpu host \ + -machine pc,accel=kvm,nvdimm=on}" +EXTRA_QEMU_OPTS="${EXTRA_QWEMU_OPTS:-\ + -object memory-backend-file,id=mem1,share=${TEST_PMEM_SHARE},\ +mem-path=/data/nvdimm0,size=${VM_FILE_SIZE} \ + -device nvdimm,id=nvdimm1,memdev=mem1 \ +}" + +SSH_TIMEOUT=120 +SSH_ARGS="-oIdentitiesOnly=yes -oStrictHostKeyChecking=no \ + -oUserKnownHostsFile=/dev/null -oLogLevel=error \ + -i ${SSH_KEY}" + +case ${VM_IMAGE} in + *Fedora*) CLOUD_USER=fedora;; + *clear*) CLOUD_USER=clear;; +esac + +atexit () { + rm -rf "$tmp" + govm rm "${GOVM_NAME}" +} +trap atexit EXIT + +function die() { + echo >&2 "ERROR: $@" + if ${SLEEP_ON_FAILURE}; then + sleep infinity + fi + exit 1 +} + +print_govm_yaml () { + cat <"${GOVM_YAML}" || die "failed to create ${GOVM_YAML}" + govm compose -f "${GOVM_YAML}" || die "govm failed" + IP=$(govm list -f '{{select (filterRegexp . "Name" "^'${GOVM_NAME}'$") "IP"}}') + echo "Waiting for ssh connectivity on vm with ip $IP" + while ! 
ssh $SSH_ARGS ${CLOUD_USER}@${IP} exit 2>/dev/null; do + if [ "$SECONDS" -gt "$SSH_TIMEOUT" ]; then + die "timeout accessing ${ip} through ssh" + fi + done +} + +result= +test_nvdimm () { + ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mkdir -p /mnt || die "cannot created /mnt" + if ! ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mount -odax /dev/pmem0p1 /mnt; then + ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo dmesg + die "cannot mount /dev/pmem0p1 with -odax" + fi + result="fstype=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %T /mnt)" + result+=" partition_size=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0p1/size) * 512))" + result+=" partition_start=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0p1/start) * 512))" + result+=" block_size=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %s /mnt)" +} + +create_image +start_vm +test_nvdimm +echo "SUCCESS: $result" From 063bb1ef8029a476789682a0b5c43ab53fef7ccc Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2019 15:35:31 +0100 Subject: [PATCH 17/28] CI: integrate imagefile testing The test needs files prepared as part of a cluster creation, without running in that cluster itself. --- Jenkinsfile | 3 +- pkg/imagefile/imagefile_test.go | 123 ------------------- pkg/imagefile/test/imagefile_test.go | 36 ++++++ pkg/imagefile/test/imagefiletest.go | 169 +++++++++++++++++++++++++++ test/check-imagefile.sh | 37 ++++-- test/e2e/e2e_test.go | 1 + test/e2e/imagefile/imagefilee2e.go | 45 +++++++ test/start-kubernetes.sh | 1 + 8 files changed, 284 insertions(+), 131 deletions(-) create mode 100644 pkg/imagefile/test/imagefile_test.go create mode 100644 pkg/imagefile/test/imagefiletest.go create mode 100644 test/e2e/imagefile/imagefilee2e.go diff --git a/Jenkinsfile b/Jenkinsfile index cc96f8abbb..9844afcff1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -161,7 +161,8 @@ pipeline { // Install additional tools: // - ssh client for govm // - python3 for Sphinx (i.e. 
make html) - sh "docker exec ${env.BUILD_CONTAINER} swupd bundle-add openssh-client python3-basic" + // - parted, xfsprogs, os-cloudguest-aws (contains mkfs.ext4) for ImageFile test + sh "docker exec ${env.BUILD_CONTAINER} swupd bundle-add openssh-client python3-basic parted xfsprogs os-cloudguest-aws" // Now commit those changes to ensure that the result of "swupd bundle add" gets cached. sh "docker commit ${env.BUILD_CONTAINER} ${env.BUILD_IMAGE}" diff --git a/pkg/imagefile/imagefile_test.go b/pkg/imagefile/imagefile_test.go index f613bfca4d..c23f24838e 100644 --- a/pkg/imagefile/imagefile_test.go +++ b/pkg/imagefile/imagefile_test.go @@ -9,21 +9,16 @@ SPDX-License-Identifier: Apache-2.0 package imagefile import ( - "bufio" "bytes" "errors" "fmt" "io/ioutil" "os" "os/exec" - "path/filepath" - "strings" "syscall" "testing" "time" - "k8s.io/apimachinery/pkg/api/resource" - "github.com/stretchr/testify/assert" ) @@ -172,121 +167,3 @@ func TestExtents(t *testing.T) { assert.Less(t, delta.Seconds(), 10.0, "time for copying file") verify(copy.Name()) } - -func logStderr(t *testing.T, err error) { - if err == nil { - return - } - var exitError *exec.ExitError - if errors.As(err, &exitError) { - t.Logf("command failed, stderr:\n%s", string(exitError.Stderr)) - } -} - -func TestImageFile(t *testing.T) { - tooSmallSize := HeaderSize - 1 - toString := func(size Bytes) string { - return resource.NewQuantity(int64(size), resource.BinarySI).String() - } - - // Try with ext4 and XFS. - run := func(fs FsType) { - t.Run(string(fs), func(t *testing.T) { - // Try with a variety of sizes because the image file is - // sensitive to alignment problems. 
- tests := []struct { - size string - expectedError string - }{ - {size: toString(tooSmallSize), expectedError: fmt.Sprintf("invalid image file size %d, must be larger than HeaderSize=%d", tooSmallSize, HeaderSize)}, - {size: "512Mi"}, - {size: "511Mi"}, - } - for _, tt := range tests { - tt := tt - t.Run(tt.size, func(t *testing.T) { - quantity := resource.MustParse(tt.size) - testImageFile(t, fs, Bytes(quantity.Value()), tt.expectedError) - }) - } - }) - } - run(Ext4) - run(Xfs) -} - -func testImageFile(t *testing.T, fs FsType, size Bytes, expectedError string) { - if _, err := exec.LookPath("parted"); err != nil { - t.Skipf("parted not found: %v", err) - } - - file, err := ioutil.TempFile("", "image") - if err != nil { - t.Fatalf("create temp file: %v", err) - } - defer rmTmpfile(file) - - err = Create(file.Name(), size, fs) - switch { - case expectedError == "" && err != nil: - logStderr(t, err) - t.Fatalf("failed to create image file: %v", err) - case expectedError != "" && err == nil: - t.Fatalf("did not fail with %q", expectedError) - case expectedError != "" && err != nil: - assert.Equal(t, err.Error(), expectedError, "wrong error message") - return - } - - fi, err := file.Stat() - if err != nil { - t.Fatalf("failed to stat image file: %v", err) - } - assert.GreaterOrEqual(t, fi.Size(), int64(size), "nominal image size") - - if os.Getenv("TEST_WORK") == "" || os.Getenv("VM_IMAGE") == "" { - t.Log("for testing the image under QEMU, download files for a cluster and set TEST_WORK and VM_IMAGE") - return - } - cmd := exec.Cmd{ - Path: "test/check-imagefile.sh", - Args: []string{"check-imagefile.sh"}, - Env: append(os.Environ(), - "EXISTING_VM_FILE="+file.Name(), - ), - Dir: filepath.Join(os.Getenv("TEST_WORK"), ".."), - } - stdout, err := cmd.StdoutPipe() - if err != nil { - t.Fatalf("failed to set up pipe: %v", err) - } - cmd.Stderr = cmd.Stdout - if err := cmd.Start(); err != nil { - t.Fatalf("start check-imagefile.sh: %v", err) - } - scanner := 
bufio.NewScanner(stdout) - success := "" - for scanner.Scan() { - line := scanner.Text() - t.Logf("check-imagefile.sh: %s", line) - if strings.HasPrefix(line, "SUCCESS: ") { - success = line - } - } - if err := cmd.Wait(); err != nil { - t.Fatalf("check-imagefile.sh failed: %v", err) - } - fsReadableForm := fs // %T output from stat - if fs == Ext4 { - // We don't bother with checking what is actually mounted, - // this is close enough. - fsReadableForm = "ext2/ext3" - } - assert.Equal(t, - fmt.Sprintf("SUCCESS: fstype=%s partition_size=%d partition_start=%d block_size=%d", - fsReadableForm, - size-HeaderSize, - DaxAlignment, - BlockSize), - success, "filesystem attributes") -} diff --git a/pkg/imagefile/test/imagefile_test.go b/pkg/imagefile/test/imagefile_test.go new file mode 100644 index 0000000000..061d2968a9 --- /dev/null +++ b/pkg/imagefile/test/imagefile_test.go @@ -0,0 +1,36 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +*/ + +// Package test contains a black-box test for the imagefile package. +// It is a separate package to allow importing it into the E2E suite +// where we can use the setup files for the current cluster to +// actually use the image file inside a VM. +package test + +import ( + "testing" +) + +type TWrapper testing.T + +func (t *TWrapper) Outer(name string, cb func(t TInterface)) { + (*testing.T)(t).Run(name, func(t *testing.T) { + cb((*TWrapper)(t)) + }) +} + +func (t *TWrapper) Inner(name string, cb func(t TInterface)) { + t.Outer(name, cb) +} + +func (t *TWrapper) Parallel() {} + +// TestImageFile is used by "go test". 
+func TestImageFile(t *testing.T) { + ImageFile((*TWrapper)(t)) +} diff --git a/pkg/imagefile/test/imagefiletest.go b/pkg/imagefile/test/imagefiletest.go new file mode 100644 index 0000000000..54149faa24 --- /dev/null +++ b/pkg/imagefile/test/imagefiletest.go @@ -0,0 +1,169 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +*/ + +// Package test contains a black-box test for the imagefile package. +// It is a separate package to allow importing it into the E2E suite +// where we can use the setup files for the current cluster to +// actually use the image file inside a VM. +package test + +import ( + "bufio" + "errors" + "fmt" + "io/ioutil" + "os" + "os/exec" + "strings" + + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/onsi/ginkgo" + "github.com/stretchr/testify/assert" + + "github.com/intel/pmem-csi/pkg/imagefile" +) + +// TInterface extends the GinkgoTInterface with support for nested tests. +type TInterface interface { + ginkgo.GinkgoTInterface + + Outer(string, func(t TInterface)) + Inner(string, func(t TInterface)) +} + +// ImageFile can be embedded inside a Ginkgo suite or a normal Go test and +// verifies that image files really work, if possible also by mounting them +// in a VM. +func ImageFile(t TInterface) { + tooSmallSize := imagefile.HeaderSize - 1 + toString := func(size imagefile.Bytes) string { + return resource.NewQuantity(int64(size), resource.BinarySI).String() + } + + // Try with ext4 and XFS. + run := func(fs imagefile.FsType) { + t.Outer(string(fs), func(t TInterface) { + // Try with a variety of sizes because the image file is + // sensitive to alignment problems. 
+ tests := []struct { + size string + expectedError string + }{ + {size: toString(tooSmallSize), expectedError: fmt.Sprintf("invalid image file size %d, must be larger than HeaderSize=%d", tooSmallSize, imagefile.HeaderSize)}, + {size: "512Mi"}, + {size: "511Mi"}, + } + for _, tt := range tests { + tt := tt + t.Inner(tt.size, func(t TInterface) { + quantity := resource.MustParse(tt.size) + testImageFile(t, fs, imagefile.Bytes(quantity.Value()), tt.expectedError) + }) + } + }) + } + run(imagefile.Ext4) + run(imagefile.Xfs) +} + +func testImageFile(t TInterface, fs imagefile.FsType, size imagefile.Bytes, expectedError string) { + if _, err := exec.LookPath("parted"); err != nil { + t.Skipf("parted not found: %v", err) + } + + file, err := ioutil.TempFile("", "image") + if err != nil { + t.Fatalf("create temp file: %v", err) + } + defer rmTmpfile(file) + + err = imagefile.Create(file.Name(), size, fs) + switch { + case expectedError == "" && err != nil: + logStderr(t, err) + t.Fatalf("failed to create image file: %v", err) + case expectedError != "" && err == nil: + t.Fatalf("did not fail with %q", expectedError) + case expectedError != "" && err != nil: + assert.Equal(t, err.Error(), expectedError, "wrong error message") + return + } + + fi, err := file.Stat() + if err != nil { + t.Fatalf("failed to stat image file: %v", err) + } + assert.GreaterOrEqual(t, fi.Size(), int64(size), "nominal image size") + + if os.Getenv("REPO_ROOT") == "" || os.Getenv("CLUSTER") == "" { + t.Log("for testing the image under QEMU, download files for a cluster and set REPO_ROOT and CLUSTER") + return + } + env := []string{ + "EXISTING_VM_FILE=" + file.Name(), + os.ExpandEnv("PATH=${REPO_ROOT}/_work/bin:${PATH}"), + os.ExpandEnv("RESOURCES_DIRECTORY=${REPO_ROOT}/_work/resources"), + os.ExpandEnv("VM_IMAGE=${REPO_ROOT}/_work/${CLUSTER}/cloud-image.qcow2"), + } + t.Logf("running %s check-imagefile.sh", strings.Join(env, " ")) + cmd := exec.Cmd{ + Path: 
os.ExpandEnv("${REPO_ROOT}/test/check-imagefile.sh"), + Args: []string{"check-imagefile.sh"}, + Env: append(os.Environ(), env...), + Dir: os.Getenv("REPO_ROOT"), + } + stdout, err := cmd.StdoutPipe() + if err != nil { + t.Fatalf("failed to set up pipe: %v", err) + } + cmd.Stderr = cmd.Stdout + if err := cmd.Start(); err != nil { + t.Fatalf("start check-imagefile.sh: %v", err) + } + scanner := bufio.NewScanner(stdout) + success := "" + for scanner.Scan() { + line := scanner.Text() + t.Logf("check-imagefile.sh: %s", line) + if strings.HasPrefix(line, "SUCCESS: ") { + success = line + } + } + if err := cmd.Wait(); err != nil { + t.Fatalf("check-imagefile.sh failed: %v", err) + } + fsReadableForm := fs // %T output from stat + if fs == imagefile.Ext4 { + // We don't bother with checking what is actually mounted, + // this is close enough. + fsReadableForm = "ext2/ext3" + } + assert.Equal(t, + fmt.Sprintf("SUCCESS: fstype=%s partition_size=%d partition_start=%d block_size=%d", + fsReadableForm, + size-imagefile.HeaderSize, + imagefile.DaxAlignment, + imagefile.BlockSize), + success, "filesystem attributes") +} + +func rmTmpfile(file *os.File) { + file.Close() + os.Remove(file.Name()) +} + +func logStderr(t TInterface, err error) { + if err == nil { + return + } + var exitError *exec.ExitError + if errors.As(err, &exitError) { + t.Logf("command failed, stderr:\n%s", string(exitError.Stderr)) + } +} diff --git a/test/check-imagefile.sh b/test/check-imagefile.sh index c028807c2a..5c5af42000 100755 --- a/test/check-imagefile.sh +++ b/test/check-imagefile.sh @@ -6,12 +6,16 @@ TEST_DIRECTORY=${TEST_DIRECTORY:-$(dirname $(readlink -f $0))} source ${TEST_CONFIG:-${TEST_DIRECTORY}/test-config.sh} +function die() { + echo >&2 "ERROR: $@" + exit 1 +} + : ${GOVM_NAME:=pmem-csi-vm} -: ${RESOURCES_DIRECTORY:=_work/resources} +: ${RESOURCES_DIRECTORY:=$(pwd)/_work/resources} : ${VM_IMAGE:=${RESOURCES_DIRECTORY}/Fedora-Cloud-Base-30-1.2.x86_64.raw} : ${EXISTING_VM_FILE:=} -: 
${EFI:=false} -tmp=$(mktemp -d) +tmp=$(mktemp -d -p $(pwd)/_work check-imagefile.XXXXXX) : ${VM_FILE:=$tmp/data/${GOVM_NAME}/nvdimm0} # same file as /data/nvdimm0 above for QEMU inside container : ${GOVM_YAML:=$tmp/govm.yaml} : ${SSH_KEY:=${RESOURCES_DIRECTORY}/id_rsa} @@ -26,7 +30,7 @@ fi KVM_CPU_OPTS="${KVM_CPU_OPTS:-\ -m ${TEST_NORMAL_MEM_SIZE}M,slots=${TEST_MEM_SLOTS},maxmem=$((${TEST_NORMAL_MEM_SIZE} + $(((VM_FILE_SIZE + 1024 * 1024 - 1) / 1024 / 1024)) ))M -smp ${TEST_NUM_CPUS} \ - -cpu host \ + -cpu ${TEST_QEMU_CPU} \ -machine pc,accel=kvm,nvdimm=on}" EXTRA_QEMU_OPTS="${EXTRA_QWEMU_OPTS:-\ -object memory-backend-file,id=mem1,share=${TEST_PMEM_SHARE},\ @@ -39,9 +43,22 @@ SSH_ARGS="-oIdentitiesOnly=yes -oStrictHostKeyChecking=no \ -oUserKnownHostsFile=/dev/null -oLogLevel=error \ -i ${SSH_KEY}" +# Might be a symbolic link. +ABS_VM_IMAGE=$(readlink --canonicalize-existing "${VM_IMAGE}") || die "cannot find actual file for ${VM_IMAGE}" +VM_IMAGE="${ABS_VM_IMAGE}" + case ${VM_IMAGE} in - *Fedora*) CLOUD_USER=fedora;; - *clear*) CLOUD_USER=clear;; + *Fedora*) + CLOUD_USER=fedora + EFI=false + ;; + *clear*) + CLOUD_USER=clear + EFI=true + ;; + *) + die "unknown cloud image ${VM_IMAGE}" + ;; esac atexit () { @@ -83,7 +100,8 @@ create_image () { if [ "${EXISTING_VM_FILE}" ]; then local dir=$(dirname ${VM_FILE}) mkdir -p "$dir" || die "failed to create $dir directory" - cp "${EXISTING_VM_FILE}" "${VM_FILE}" || die "failed to create ${VM_FILE} from ${EXISTING_VM_FILE}" + # Try a hardlink first (faster), only copy if necessary. + ln "${EXISTING_VM_FILE}" "${VM_FILE}" 2>/dev/null || cp "${EXISTING_VM_FILE}" "${VM_FILE}" || die "failed to create ${VM_FILE} from ${EXISTING_VM_FILE}" return fi } @@ -95,6 +113,11 @@ start_vm () { echo "Waiting for ssh connectivity on vm with ip $IP" while ! 
ssh $SSH_ARGS ${CLOUD_USER}@${IP} exit 2>/dev/null; do if [ "$SECONDS" -gt "$SSH_TIMEOUT" ]; then + ( set -x; + govm list + docker ps + docker logs govm.$(id -n -u).${GOVM_NAME} + ) die "timeout accessing ${ip} through ssh" fi done diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index fe37ae5f83..f7708179e1 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -31,6 +31,7 @@ import ( // test sources _ "github.com/intel/pmem-csi/test/e2e/gotests" + _ "github.com/intel/pmem-csi/test/e2e/imagefile" _ "github.com/intel/pmem-csi/test/e2e/storage" _ "github.com/intel/pmem-csi/test/e2e/tls" diff --git a/test/e2e/imagefile/imagefilee2e.go b/test/e2e/imagefile/imagefilee2e.go new file mode 100644 index 0000000000..b80b1c5149 --- /dev/null +++ b/test/e2e/imagefile/imagefilee2e.go @@ -0,0 +1,45 @@ +/* + +Copyright (c) 2017-2019 Intel Corporation + +SPDX-License-Identifier: Apache-2.0 + +*/ + +package imagefilee2e + +import ( + "fmt" + + "github.com/onsi/ginkgo" + + "github.com/intel/pmem-csi/pkg/imagefile/test" +) + +type tImplementation struct { + ginkgo.GinkgoTInterface +} + +func (t *tImplementation) Outer(name string, cb func(t test.TInterface)) { + ginkgo.Context(name, func() { + cb(t) + }) +} + +func (t *tImplementation) Inner(name string, cb func(t test.TInterface)) { + ginkgo.It(name, func() { + // This code now can call GinkgoT and pass some actual implementation + // of the interface. + cb(&tImplementation{ginkgo.GinkgoT()}) + }) +} + +// Only necessary because of https://github.com/onsi/ginkgo/issues/659 +func (t *tImplementation) Skipf(format string, args ...interface{}) { + ginkgo.Skip(fmt.Sprintf(format, args...)) +} + +var _ = ginkgo.Describe("imagefile", func() { + // Our Outer and Inner implementation do not need a valid pointer. 
+ test.ImageFile((*tImplementation)(nil)) +}) diff --git a/test/start-kubernetes.sh b/test/start-kubernetes.sh index 49638495ed..df8dd03c02 100755 --- a/test/start-kubernetes.sh +++ b/test/start-kubernetes.sh @@ -506,6 +506,7 @@ trap cleanup EXIT if init_workdir && CLOUD_IMAGE=$(download_image) && + ln -sf ${RESOURCES_DIRECTORY}/${CLOUD_IMAGE} ${CLUSTER_DIRECTORY}/cloud-image.qcow2 && create_vms && NO_PROXY=$(extend_no_proxy) && init_pmem_regions && From f26ee127ce15d54fa3ffa2aa112e78444ec946db Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 10 Jan 2020 09:52:41 +0100 Subject: [PATCH 18/28] imagefile: disable reflink for XFS Same change as inside the PMEM-CSI driver itself: we have to ensure that reflink is off because it is incompatible with "-o dax". --- pkg/imagefile/imagefile.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/imagefile/imagefile.go b/pkg/imagefile/imagefile.go index 72283592d4..95f59fda34 100644 --- a/pkg/imagefile/imagefile.go +++ b/pkg/imagefile/imagefile.go @@ -293,7 +293,10 @@ func Create(filename string, size Bytes, fs FsType) error { case Ext4: args = append(args, "-b", fmt.Sprintf("%d", BlockSize)) case Xfs: - args = append(args, "-b", fmt.Sprintf("size=%d", BlockSize)) + args = append(args, + "-b", fmt.Sprintf("size=%d", BlockSize), + "-m", "reflink=0", + ) } args = append(args, fsimage) cmd := exec.Command(args[0], args[1:]...) From 9589d6288dfa2c592ea945010c90277a11e45490 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 10 Jan 2020 09:54:19 +0100 Subject: [PATCH 19/28] imagefile: remove second MBR The first MBR may be useful when working with the image file on the host, but the second one has no real purpose. It just causes more work for the kernel and the kernel developers were even contemplating to remove or change partitioning support. Even though it looks like it will be kept (https://lkml.org/lkml/2020/1/9/904), we better shouldn't depend on it unnecessarily. 
--- pkg/imagefile/imagefile.go | 66 +++++++++++++---------------- pkg/imagefile/test/imagefiletest.go | 5 +-- test/check-imagefile.sh | 9 ++-- 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/pkg/imagefile/imagefile.go b/pkg/imagefile/imagefile.go index 95f59fda34..a4d9bce80b 100644 --- a/pkg/imagefile/imagefile.go +++ b/pkg/imagefile/imagefile.go @@ -23,33 +23,35 @@ Furthermore, this file is based on the following code published by Intel under A /* Package imagefile contains code to create a file with the following content: - .-----------.----------.---------------.-----------. - | 0 - 512 B | 4 - 8 Kb | 2M - 2M+512B | 4M | - |-----------+----------+---------------+-----------+ - | MBR #1 | DAX | MBR #2 | FS | - '-----------'----------'---------------'-----------+ - | | ^ | ^ - | '-data-' '--------' - | | - '--------rootfs-partition---------' - - ^ ^ ^ - daxHeaderOffset | | - daxHeaderSize | - HeaderSize + .-----------.----------.---------------. + | 0 - 512 B | 4 - 8 Kb | 2M - ... | + |-----------+----------+---------------+ + | MBR #1 | DAX | FS | + '-----------'----------'---------------' + | | ^ + | '-data-' + | | + '--fs-partition--' + + ^ ^ + daxHeaderOffset | + HeaderSize MBR: Master boot record. -DAX: Metadata required by the NVDIMM driver to enable DAX in the guest [1][2] (struct nd_pfn_sb). +DAX: Metadata required by the NVDIMM driver to enable DAX in the guest (struct nd_pfn_sb). FS: partition that contains a filesystem. -Kernels and hypervisors that support DAX/NVDIMM read the MBR #2, otherwise MBR #1 is read. -The /dev/pmem0 device starts at the MBR which is used. 
+The MBR is useful for working with the image file: +- the `file` utility uses it to determine what the file contains +- when binding the entire file to /dev/loop0, /dev/loop0p1 will be + the file system (beware that partprobe /dev/loop0 might be needed); + alternatively one could bind the file system directly by specifying an offset When such a file is created on a dax-capable filesystem, then it can be used as backing store for a [QEMU nvdimm device](https://github.com/qemu/qemu/blob/master/docs/nvdimm.txt) such -that the guest kernel provides a /dev/pmem0p1 (the FS partition above) +that the guest kernel provides a /dev/pmem0 (the FS partition above) which can be mounted with -odax. For full dax semantic, the QEMU device configuration must use the 'pmem' and 'share' flags. */ @@ -192,18 +194,17 @@ const ( // (https://github.com/torvalds/linux/blob/2187f215ebaac73ddbd814696d7c7fa34f0c3de0/drivers/nvdimm/pfn_devs.c#L438-L596). daxHeaderOffset = 4 * KiB - // Start of MBR#2. - // Chosen so that we have enough space before it for MBR #1 and the DAX metadata, - // doesn't really have to be aligned. - daxHeaderSize = DaxAlignment - - // Total size of the MBR + DAX data before the actual partition. - // This has to be aligned relative to MBR#2, which is at daxHeaderSize, - // for huge pages to work inside a VM. - HeaderSize = daxHeaderSize + DaxAlignment + // Start of the file system. + // Chosen so that we have enough space before it for MBR #1 and the DAX metadata. + HeaderSize = DaxAlignment // Block size used for the filesystem. ext4 only supports dax with 4KiB blocks. BlockSize = 4 * KiB + + // /dev/pmem0 will have some fake disk geometry attached to it. + // We have to make the final device size a multiple of the fake + // head * track... even if the actual filesystem is smaller. + DiskAlignment = DaxAlignment * 512 ) // Create writes a complete image file of a certain total size. 
@@ -265,7 +266,6 @@ func Create(filename string, size Bytes, fs FsType) error { } defer os.RemoveAll(tmp) mbr1 := filepath.Join(tmp, "mbr1") - mbr2 := filepath.Join(tmp, "mbr2") fsimage := filepath.Join(tmp, "fsimage") // This is for the full image file. @@ -273,11 +273,6 @@ func Create(filename string, size Bytes, fs FsType) error { return err } - // This is for the image file minus the dax header. - if err := writeMBR(mbr2, fs, DaxAlignment, size-daxHeaderSize); err != nil { - return err - } - // Create a file of the desired size, then let mkfs write into it. fsFile, err := os.Create(fsimage) if err != nil { @@ -311,10 +306,7 @@ func Create(filename string, size Bytes, fs FsType) error { if _, err := file.Seek(int64(daxHeaderOffset), os.SEEK_SET); err != nil { return err } - if _, err := file.Write(nsdax(uint(daxHeaderSize), uint(DaxAlignment))); err != nil { - return err - } - if err := dd(mbr2, filename, true, daxHeaderSize); err != nil { + if _, err := file.Write(nsdax(uint(HeaderSize), uint(DaxAlignment))); err != nil { return err } if err := dd(fsimage, filename, true, HeaderSize); err != nil { diff --git a/pkg/imagefile/test/imagefiletest.go b/pkg/imagefile/test/imagefiletest.go index 54149faa24..f852965f69 100644 --- a/pkg/imagefile/test/imagefiletest.go +++ b/pkg/imagefile/test/imagefiletest.go @@ -145,10 +145,9 @@ func testImageFile(t TInterface, fs imagefile.FsType, size imagefile.Bytes, expe fsReadableForm = "ext2/ext3" } assert.Equal(t, - fmt.Sprintf("SUCCESS: fstype=%s partition_size=%d partition_start=%d block_size=%d", + fmt.Sprintf("SUCCESS: fstype=%s partition_size=%d block_size=%d", fsReadableForm, - size-imagefile.HeaderSize, - imagefile.DaxAlignment, + fi.Size()-int64(imagefile.HeaderSize), imagefile.BlockSize), success, "filesystem attributes") } diff --git a/test/check-imagefile.sh b/test/check-imagefile.sh index 5c5af42000..c93afaef6f 100755 --- a/test/check-imagefile.sh +++ b/test/check-imagefile.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Produces 
an image file for QEMU in a tmp directory, then -# checks that QEMU comes up with a /dev/pmem0p1 device. +# checks that QEMU comes up with a /dev/pmem0 device. TEST_DIRECTORY=${TEST_DIRECTORY:-$(dirname $(readlink -f $0))} source ${TEST_CONFIG:-${TEST_DIRECTORY}/test-config.sh} @@ -126,13 +126,12 @@ start_vm () { result= test_nvdimm () { ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mkdir -p /mnt || die "cannot created /mnt" - if ! ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mount -odax /dev/pmem0p1 /mnt; then + if ! ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mount -odax /dev/pmem0 /mnt; then ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo dmesg - die "cannot mount /dev/pmem0p1 with -odax" + die "cannot mount /dev/pmem0 with -odax" fi result="fstype=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %T /mnt)" - result+=" partition_size=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0p1/size) * 512))" - result+=" partition_start=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0p1/start) * 512))" + result+=" partition_size=$(($(ssh $SSH_ARGS ${CLOUD_USER}@${IP} cat /sys/class/block/pmem0/size) * 512))" result+=" block_size=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %s /mnt)" } From 24d8a580041518aa5b4e4b18652422d7af6b71b8 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Tue, 14 Jan 2020 11:35:16 +0100 Subject: [PATCH 20/28] imagefile: allow allocating file with maximum size This is needed when trying to create an image that is as large as the volume allows. Implementing the necessary retry loop here is better than doing it in the caller because here we have more knowledge about what is a "not enough space" error. A fixed overhead was tried before, but that didn't work in practice because the overhead varied depending on the size of the volume. For example, with a 4 GiB ext4 volume the resulting file has to be 164 MiB smaller. 
--- pkg/imagefile/imagefile.go | 68 +++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/pkg/imagefile/imagefile.go b/pkg/imagefile/imagefile.go index a4d9bce80b..66e7826ff5 100644 --- a/pkg/imagefile/imagefile.go +++ b/pkg/imagefile/imagefile.go @@ -163,7 +163,9 @@ package imagefile import "C" import ( + "errors" "fmt" + "golang.org/x/sys/unix" "io/ioutil" "os" "os/exec" @@ -215,20 +217,16 @@ const ( // for the size of that partition. A multiple of BlockSize // should be safe. // -// The resulting file will have "size" bytes in use, but the nominal -// size may be larger to accomodate aligment requirements when -// creating a nvdimm device for QEMU. The extra bytes at the end -// are not allocated and never should be read or written because -// the partition ends before them. +// The resulting file will have "size" bytes in use. If bytes is zero, +// then the image file will be made as large as possible. // // The result will be sparse, i.e. empty parts are not actually // written yet, but they will be allocated, so there is no risk // later on that attempting to write fails due to lack of space. func Create(filename string, size Bytes, fs FsType) error { - if size <= HeaderSize { + if size != 0 && size <= HeaderSize { return fmt.Errorf("invalid image file size %d, must be larger than HeaderSize=%d", size, HeaderSize) } - fsSize := size - HeaderSize dir, err := os.Open(filepath.Dir(filename)) if err != nil { @@ -251,12 +249,11 @@ func Create(filename string, size Bytes, fs FsType) error { defer file.Close() // Enlarge the file and ensure that we really have enough space for it. 
- if err := file.Truncate(int64(size)); err != nil { - return fmt.Errorf("resize %q to %d: %w", filename, size, err) - } - if err := syscall.Fallocate(int(file.Fd()), 0, 0, int64(size)); err != nil { - return fmt.Errorf("fallocate %q size %d: %w", filename, size, err) + size, err = allocateFile(file, size) + if err != nil { + return err } + fsSize := size - HeaderSize // We write MBRs and rootfs into temporary files, then copy into the // final image file at the end. @@ -339,6 +336,53 @@ func Create(filename string, size Bytes, fs FsType) error { return nil } +func allocateFile(file *os.File, size Bytes) (Bytes, error) { + if size != 0 { + if err := file.Truncate(int64(size)); err != nil { + return 0, fmt.Errorf("resize %q to %d: %w", file.Name(), size, err) + } + if err := unix.Fallocate(int(file.Fd()), 0, 0, int64(size)); err != nil { + return 0, fmt.Errorf("fallocate %q size %d: %w", file.Name(), size, err) + } + return size, nil + } + + // Determine how much space is left on the volume of the image file. + var stat unix.Statfs_t + if err := unix.Fstatfs(int(file.Fd()), &stat); err != nil { + return 0, fmt.Errorf("unexpected error while checking the volume statistics for %q: %v", file.Name(), err) + } + + // We have to try how large the file can become. + blockOverhead := uint64(1) + for blockOverhead < stat.Bfree { + size := Bytes(int64(stat.Bfree-blockOverhead) * stat.Bsize) + size = (size + DaxAlignment - 1) / DaxAlignment * DaxAlignment + if size == 0 { + break + } + size, err := allocateFile(file, Bytes(size)) + if err == nil { + return size, nil + } + var errno syscall.Errno + if !errors.As(err, &errno) || errno != syscall.ENOSPC { + return 0, fmt.Errorf("allocate %d bytes for file %q: %v", size, file.Name(), err) + } + + // Double the overhead while it is still small, later just increment with a fixed amount. 
+ // This way we don't make the overhead at most 64 * block size larger than it really + // has to be while not trying too often (which we would do when incrementing the overhead + // by just one from the start). + if blockOverhead < 64 { + blockOverhead *= 2 + } else { + blockOverhead += 64 + } + } + return 0, fmt.Errorf("volume of %d blocks (block size %d bytes) too small for image file", stat.Bfree, stat.Bsize) +} + // nsdax prepares 4KiB of DAX metadata. func nsdax(dataOffset uint, alignment uint) []byte { p := C.malloc(C.sizeof_struct_nd_pfn_sb) From 65804f660c295ed863b0c3e56c0bda7f864d9929 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 8 May 2020 09:39:17 +0200 Subject: [PATCH 21/28] check-imagefile.sh: retry SSH As seen before, sshd in Fedora seems to restart during booting, thus defeating the "wait for it to be ready" check: check-imagefile.sh: Waiting for ssh connectivity on vm with ip 172.17.0.8 check-imagefile.sh: ssh: connect to host 172.17.0.8 port 22: Connection refused check-imagefile.sh: [ 0.000000] Linux version 5.3.7-301.fc31.x86_64 (mockbuild@bkernel03.phx2.fedoraproject.org) ... Here it was the "sudo mount" command which failed, which wasn't particularly obvious. Now each attempt to log into the VM is tried multiple times and logging is a bit better. --- test/check-imagefile.sh | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/test/check-imagefile.sh b/test/check-imagefile.sh index c93afaef6f..755eb07f37 100755 --- a/test/check-imagefile.sh +++ b/test/check-imagefile.sh @@ -110,24 +110,41 @@ start_vm () { print_govm_yaml >"${GOVM_YAML}" || die "failed to create ${GOVM_YAML}" govm compose -f "${GOVM_YAML}" || die "govm failed" IP=$(govm list -f '{{select (filterRegexp . "Name" "^'${GOVM_NAME}'$") "IP"}}') - echo "Waiting for ssh connectivity on vm with ip $IP" - while ! ssh $SSH_ARGS ${CLOUD_USER}@${IP} exit 2>/dev/null; do +} + +sshtovm () { + echo "Running $@ on VM." 
+ while true; do + if out=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} "$@" 2>&1); then + # Success! + echo "$out" + return + fi + # SSH access to the VM is unavailable directly after booting and (with Fedora 31) + # even intermittently when the SSH server restarts. We deal with this by + # retrying for a while when we get "connection refused" errors. if [ "$SECONDS" -gt "$SSH_TIMEOUT" ]; then ( set -x; govm list docker ps docker logs govm.$(id -n -u).${GOVM_NAME} ) - die "timeout accessing ${ip} through ssh" + die "timeout accessing ${IP} through ssh" + fi + if ! echo "$out" | grep -q "connect to host ${IP}"; then + # Some other error, probably in the command itself. Give up. + echo "$out" + return 1 fi done } result= test_nvdimm () { - ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mkdir -p /mnt || die "cannot created /mnt" - if ! ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo mount -odax /dev/pmem0 /mnt; then - ssh $SSH_ARGS ${CLOUD_USER}@${IP} sudo dmesg + sshtovm sudo mkdir -p /mnt || die "cannot created /mnt" + if ! sshtovm sudo mount -odax /dev/pmem0 /mnt; then + echo + sshtovm sudo dmesg die "cannot mount /dev/pmem0 with -odax" fi result="fstype=$(ssh $SSH_ARGS ${CLOUD_USER}@${IP} stat --file-system -c %T /mnt)" From 5fe60ce08e21d711ddbda3aeebfa76889b0266f5 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 26 Feb 2020 17:01:38 +0100 Subject: [PATCH 22/28] pmem state: document that only .json files matter The implementation already worked like that, it just wasn't documented and thus it was unknown whether reusing the directory also for other local state (like the upcoming extra volume mounts) is okay. 
--- pkg/pmem-state/pmem-state.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/pmem-state/pmem-state.go b/pkg/pmem-state/pmem-state.go index b27b6f25ac..22370eb951 100644 --- a/pkg/pmem-state/pmem-state.go +++ b/pkg/pmem-state/pmem-state.go @@ -44,7 +44,9 @@ var _ StateManager = &fileState{} // NewFileState instantiates the file state manager with given directory // location. It ensures the provided directory exists. -// Returns error, if fails to create the directory in case of not pre-existing. +// State entries are mapped to files with the .json suffix in that directory and +// vice versa. Other directory content is ignored, which makes it possible +// to use the directory also for other state information. func NewFileState(directory string) (StateManager, error) { if err := ensureLocation(directory); err != nil { return nil, err From 477a33b5d242f9e5d49db94063a87245a04a96a7 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 13 Jan 2020 07:45:02 +0100 Subject: [PATCH 23/28] support Kata Containers A persistent or ephemeral volume can either be prepared for usage by DAX-enabled applications that don't run under Kata Containers (the default) or for DAX-enabled applications that run under Kata Containers. In both cases the volume can be used with and without Kata Containers, it's just that DAX only works either inside or outside of Kata Containers. The Kata Container runtime must be able to access the image file while it is still mounted, therefore we cannot use something inside the target dir as mount point, because then the image file is shadowed by the mounted filesystem. We already have a local state dir for .json files. Putting something else inside it might confuse the state code, so instead we create a second directory with ".mount" appended to the directory name and use that for mount points.
We also have to enable bi-directional mount propagation for it because otherwise the mounted fs with the image file is still only visible inside the container). --- Makefile | 4 +- deploy/common/pmem-kata-app-ephemeral.yaml | 26 ++ deploy/common/pmem-kata-app.yaml | 22 ++ deploy/common/pmem-kata-pvc.yaml | 11 + .../common/pmem-storageclass-ext4-kata.yaml | 13 + deploy/common/pmem-storageclass-xfs-kata.yaml | 13 + deploy/kubernetes-1.15/direct/pmem-csi.yaml | 1 + .../direct/testing/pmem-csi.yaml | 1 + deploy/kubernetes-1.15/lvm/pmem-csi.yaml | 1 + .../kubernetes-1.15/lvm/testing/pmem-csi.yaml | 1 + .../pmem-csi-direct-testing.yaml | 1 + deploy/kubernetes-1.15/pmem-csi-direct.yaml | 1 + .../kubernetes-1.15/pmem-csi-lvm-testing.yaml | 1 + deploy/kubernetes-1.15/pmem-csi-lvm.yaml | 1 + .../pmem-storageclass-ext4-kata.yaml | 1 + .../pmem-storageclass-xfs-kata.yaml | 1 + deploy/kubernetes-1.16/direct/pmem-csi.yaml | 1 + .../direct/testing/pmem-csi.yaml | 1 + deploy/kubernetes-1.16/lvm/pmem-csi.yaml | 1 + .../kubernetes-1.16/lvm/testing/pmem-csi.yaml | 1 + .../pmem-csi-direct-testing.yaml | 1 + deploy/kubernetes-1.16/pmem-csi-direct.yaml | 1 + .../kubernetes-1.16/pmem-csi-lvm-testing.yaml | 1 + deploy/kubernetes-1.16/pmem-csi-lvm.yaml | 1 + .../pmem-storageclass-ext4-kata.yaml | 1 + .../pmem-storageclass-xfs-kata.yaml | 1 + deploy/kustomize/driver/pmem-csi.yaml | 5 + docs/design.md | 44 ++- docs/install.md | 64 +++- pkg/pmem-csi-driver/nodeserver.go | 231 +++++++++++--- pkg/pmem-csi-driver/parameters/parameters.go | 35 +- pkg/pmem-csi-driver/pmem-csi-driver.go | 3 +- pkg/volumepathhandler/volume_path_handler.go | 298 ++++++++++++++++++ .../volume_path_handler_linux.go | 270 ++++++++++++++++ test/e2e/storage/csi_volumes.go | 58 +++- test/e2e/storage/dax/dax.go | 52 ++- test/setup-deployment.sh | 2 + test/setup-kata-containers.sh | 42 +++ test/start-kubernetes.sh | 5 +- test/test-config.sh | 5 + 40 files changed, 1156 insertions(+), 67 deletions(-) create mode 100644 
deploy/common/pmem-kata-app-ephemeral.yaml create mode 100644 deploy/common/pmem-kata-app.yaml create mode 100644 deploy/common/pmem-kata-pvc.yaml create mode 100644 deploy/common/pmem-storageclass-ext4-kata.yaml create mode 100644 deploy/common/pmem-storageclass-xfs-kata.yaml create mode 120000 deploy/kubernetes-1.15/pmem-storageclass-ext4-kata.yaml create mode 120000 deploy/kubernetes-1.15/pmem-storageclass-xfs-kata.yaml create mode 120000 deploy/kubernetes-1.16/pmem-storageclass-ext4-kata.yaml create mode 120000 deploy/kubernetes-1.16/pmem-storageclass-xfs-kata.yaml create mode 100644 pkg/volumepathhandler/volume_path_handler.go create mode 100644 pkg/volumepathhandler/volume_path_handler_linux.go create mode 100755 test/setup-kata-containers.sh diff --git a/Makefile b/Makefile index 1e85e1d5fb..9a31521dd0 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,7 @@ KUSTOMIZE_OUTPUT += deploy/common/pmem-storageclass-cache.yaml KUSTOMIZATION_deploy/common/pmem-storageclass-cache.yaml = deploy/kustomize/storageclass-cache KUSTOMIZE_OUTPUT += deploy/common/pmem-storageclass-late-binding.yaml KUSTOMIZATION_deploy/common/pmem-storageclass-late-binding.yaml = deploy/kustomize/storageclass-late-binding -kustomize: $(KUSTOMIZE_OUTPUT) +kustomize: clean_kustomize_output $(KUSTOMIZE_OUTPUT) $(KUSTOMIZE_OUTPUT): _work/kustomize $(KUSTOMIZE_INPUT) $< build --load_restrictor none $(KUSTOMIZATION_$@) >$@ if echo "$@" | grep -q '/pmem-csi-'; then \ @@ -185,6 +185,8 @@ $(KUSTOMIZE_OUTPUT): _work/kustomize $(KUSTOMIZE_INPUT) cp $@ $$dir/pmem-csi.yaml && \ echo 'resources: [ pmem-csi.yaml ]' > $$dir/kustomization.yaml; \ fi +clean_kustomize_output: + rm -f $(KUSTOMIZE_OUTPUT) # Always re-generate the output files because "git rebase" might have # left us with an inconsistent state. 
diff --git a/deploy/common/pmem-kata-app-ephemeral.yaml b/deploy/common/pmem-kata-app-ephemeral.yaml new file mode 100644 index 0000000000..a15ec622e7 --- /dev/null +++ b/deploy/common/pmem-kata-app-ephemeral.yaml @@ -0,0 +1,26 @@ +kind: Pod +apiVersion: v1 +metadata: + name: my-csi-app-inline-volume + labels: + io.katacontainers.config.hypervisor.memory_offset: "2147483648" # 2Gi, must be at least as large as the PMEM volume +spec: + # see https://github.com/kata-containers/packaging/tree/1.11.0-rc0/kata-deploy#run-a-sample-workload + runtimeClassName: kata-qemu + nodeSelector: + katacontainers.io/kata-runtime: "true" + containers: + - name: my-frontend + image: intel/pmem-csi-driver-test:canary + command: [ "sleep", "100000" ] + volumeMounts: + - mountPath: "/data" + name: my-csi-volume + volumes: + - name: my-csi-volume + csi: + driver: pmem-csi.intel.com + fsType: "xfs" + volumeAttributes: + size: "2Gi" + kataContainers: "true" diff --git a/deploy/common/pmem-kata-app.yaml b/deploy/common/pmem-kata-app.yaml new file mode 100644 index 0000000000..a610577341 --- /dev/null +++ b/deploy/common/pmem-kata-app.yaml @@ -0,0 +1,22 @@ +kind: Pod +apiVersion: v1 +metadata: + name: my-csi-kata-app + labels: + io.katacontainers.config.hypervisor.memory_offset: "2147483648" # 2Gi, must be at least as large as the PMEM volume +spec: + # see https://github.com/kata-containers/packaging/tree/1.11.0-rc0/kata-deploy#run-a-sample-workload + runtimeClassName: kata-qemu + nodeSelector: + katacontainers.io/kata-runtime: "true" + containers: + - name: my-frontend + image: intel/pmem-csi-driver-test:canary + command: [ "sleep", "100000" ] + volumeMounts: + - mountPath: "/data" + name: my-csi-volume + volumes: + - name: my-csi-volume + persistentVolumeClaim: + claimName: pmem-csi-pvc-kata # see pmem-kata-pvc.yaml diff --git a/deploy/common/pmem-kata-pvc.yaml b/deploy/common/pmem-kata-pvc.yaml new file mode 100644 index 0000000000..21c31bdb80 --- /dev/null +++ 
b/deploy/common/pmem-kata-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pmem-csi-pvc-kata +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 4Gi + storageClassName: pmem-csi-sc-ext4-kata # defined in pmem-storageclass-ext4-kata.yaml diff --git a/deploy/common/pmem-storageclass-ext4-kata.yaml b/deploy/common/pmem-storageclass-ext4-kata.yaml new file mode 100644 index 0000000000..63d03c76f9 --- /dev/null +++ b/deploy/common/pmem-storageclass-ext4-kata.yaml @@ -0,0 +1,13 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: pmem-csi-sc-ext4-kata +parameters: + csi.storage.k8s.io/fstype: ext4 + eraseafter: "true" + kataContainers: "true" +provisioner: pmem-csi.intel.com +reclaimPolicy: Delete +# Kata Containers might not be available on all nodes, wait for pod scheduling +# and then create volume on the chosen node(s). +volumeBindingMode: WaitForFirstConsumer diff --git a/deploy/common/pmem-storageclass-xfs-kata.yaml b/deploy/common/pmem-storageclass-xfs-kata.yaml new file mode 100644 index 0000000000..da818ac6eb --- /dev/null +++ b/deploy/common/pmem-storageclass-xfs-kata.yaml @@ -0,0 +1,13 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: pmem-csi-sc-xfs-kata +parameters: + csi.storage.k8s.io/fstype: xfs + eraseafter: "true" + kataContainers: "true" +provisioner: pmem-csi.intel.com +reclaimPolicy: Delete +# Kata Containers might not be available on all nodes, wait for pod scheduling +# and then create volume on the chosen node(s). 
+volumeBindingMode: WaitForFirstConsumer diff --git a/deploy/kubernetes-1.15/direct/pmem-csi.yaml b/deploy/kubernetes-1.15/direct/pmem-csi.yaml index cba13d1883..66b8f05a77 100644 --- a/deploy/kubernetes-1.15/direct/pmem-csi.yaml +++ b/deploy/kubernetes-1.15/direct/pmem-csi.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.15/direct/testing/pmem-csi.yaml b/deploy/kubernetes-1.15/direct/testing/pmem-csi.yaml index d789c28759..415315e725 100644 --- a/deploy/kubernetes-1.15/direct/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.15/direct/testing/pmem-csi.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.15/lvm/pmem-csi.yaml b/deploy/kubernetes-1.15/lvm/pmem-csi.yaml index c595410f2a..bc2d3eec4f 100644 --- a/deploy/kubernetes-1.15/lvm/pmem-csi.yaml +++ b/deploy/kubernetes-1.15/lvm/pmem-csi.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - args: - -v=3 diff --git a/deploy/kubernetes-1.15/lvm/testing/pmem-csi.yaml b/deploy/kubernetes-1.15/lvm/testing/pmem-csi.yaml index 6b4e0e61c8..410cc481a1 100644 --- a/deploy/kubernetes-1.15/lvm/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.15/lvm/testing/pmem-csi.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /var/lib/pmem-csi-coverage name: coverage-dir diff --git a/deploy/kubernetes-1.15/pmem-csi-direct-testing.yaml b/deploy/kubernetes-1.15/pmem-csi-direct-testing.yaml index d789c28759..415315e725 100644 --- 
a/deploy/kubernetes-1.15/pmem-csi-direct-testing.yaml +++ b/deploy/kubernetes-1.15/pmem-csi-direct-testing.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.15/pmem-csi-direct.yaml b/deploy/kubernetes-1.15/pmem-csi-direct.yaml index cba13d1883..66b8f05a77 100644 --- a/deploy/kubernetes-1.15/pmem-csi-direct.yaml +++ b/deploy/kubernetes-1.15/pmem-csi-direct.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.15/pmem-csi-lvm-testing.yaml b/deploy/kubernetes-1.15/pmem-csi-lvm-testing.yaml index 6b4e0e61c8..410cc481a1 100644 --- a/deploy/kubernetes-1.15/pmem-csi-lvm-testing.yaml +++ b/deploy/kubernetes-1.15/pmem-csi-lvm-testing.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /var/lib/pmem-csi-coverage name: coverage-dir diff --git a/deploy/kubernetes-1.15/pmem-csi-lvm.yaml b/deploy/kubernetes-1.15/pmem-csi-lvm.yaml index c595410f2a..bc2d3eec4f 100644 --- a/deploy/kubernetes-1.15/pmem-csi-lvm.yaml +++ b/deploy/kubernetes-1.15/pmem-csi-lvm.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - args: - -v=3 diff --git a/deploy/kubernetes-1.15/pmem-storageclass-ext4-kata.yaml b/deploy/kubernetes-1.15/pmem-storageclass-ext4-kata.yaml new file mode 120000 index 0000000000..f2344cfd9f --- /dev/null +++ b/deploy/kubernetes-1.15/pmem-storageclass-ext4-kata.yaml @@ -0,0 +1 @@ +../common/pmem-storageclass-ext4-kata.yaml \ No newline at end of file diff --git a/deploy/kubernetes-1.15/pmem-storageclass-xfs-kata.yaml 
b/deploy/kubernetes-1.15/pmem-storageclass-xfs-kata.yaml new file mode 120000 index 0000000000..7e53d6ab8b --- /dev/null +++ b/deploy/kubernetes-1.15/pmem-storageclass-xfs-kata.yaml @@ -0,0 +1 @@ +../common/pmem-storageclass-xfs-kata.yaml \ No newline at end of file diff --git a/deploy/kubernetes-1.16/direct/pmem-csi.yaml b/deploy/kubernetes-1.16/direct/pmem-csi.yaml index b7764bb0c7..d2b6cba3b1 100644 --- a/deploy/kubernetes-1.16/direct/pmem-csi.yaml +++ b/deploy/kubernetes-1.16/direct/pmem-csi.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.16/direct/testing/pmem-csi.yaml b/deploy/kubernetes-1.16/direct/testing/pmem-csi.yaml index 45f24caaad..cfcf2a8445 100644 --- a/deploy/kubernetes-1.16/direct/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.16/direct/testing/pmem-csi.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.16/lvm/pmem-csi.yaml b/deploy/kubernetes-1.16/lvm/pmem-csi.yaml index 8291ded5be..4dea9d726b 100644 --- a/deploy/kubernetes-1.16/lvm/pmem-csi.yaml +++ b/deploy/kubernetes-1.16/lvm/pmem-csi.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - args: - -v=3 diff --git a/deploy/kubernetes-1.16/lvm/testing/pmem-csi.yaml b/deploy/kubernetes-1.16/lvm/testing/pmem-csi.yaml index fa7681c7b0..538d5ef5a7 100644 --- a/deploy/kubernetes-1.16/lvm/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.16/lvm/testing/pmem-csi.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /var/lib/pmem-csi-coverage 
name: coverage-dir diff --git a/deploy/kubernetes-1.16/pmem-csi-direct-testing.yaml b/deploy/kubernetes-1.16/pmem-csi-direct-testing.yaml index 45f24caaad..cfcf2a8445 100644 --- a/deploy/kubernetes-1.16/pmem-csi-direct-testing.yaml +++ b/deploy/kubernetes-1.16/pmem-csi-direct-testing.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.16/pmem-csi-direct.yaml b/deploy/kubernetes-1.16/pmem-csi-direct.yaml index b7764bb0c7..d2b6cba3b1 100644 --- a/deploy/kubernetes-1.16/pmem-csi-direct.yaml +++ b/deploy/kubernetes-1.16/pmem-csi-direct.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /sys name: sys-dir diff --git a/deploy/kubernetes-1.16/pmem-csi-lvm-testing.yaml b/deploy/kubernetes-1.16/pmem-csi-lvm-testing.yaml index fa7681c7b0..538d5ef5a7 100644 --- a/deploy/kubernetes-1.16/pmem-csi-lvm-testing.yaml +++ b/deploy/kubernetes-1.16/pmem-csi-lvm-testing.yaml @@ -351,6 +351,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - mountPath: /var/lib/pmem-csi-coverage name: coverage-dir diff --git a/deploy/kubernetes-1.16/pmem-csi-lvm.yaml b/deploy/kubernetes-1.16/pmem-csi-lvm.yaml index 8291ded5be..4dea9d726b 100644 --- a/deploy/kubernetes-1.16/pmem-csi-lvm.yaml +++ b/deploy/kubernetes-1.16/pmem-csi-lvm.yaml @@ -314,6 +314,7 @@ spec: - mountPath: /dev name: dev-dir - mountPath: /var/lib/pmem-csi.intel.com + mountPropagation: Bidirectional name: pmem-state-dir - args: - -v=3 diff --git a/deploy/kubernetes-1.16/pmem-storageclass-ext4-kata.yaml b/deploy/kubernetes-1.16/pmem-storageclass-ext4-kata.yaml new file mode 120000 index 0000000000..f2344cfd9f --- /dev/null +++ 
b/deploy/kubernetes-1.16/pmem-storageclass-ext4-kata.yaml @@ -0,0 +1 @@ +../common/pmem-storageclass-ext4-kata.yaml \ No newline at end of file diff --git a/deploy/kubernetes-1.16/pmem-storageclass-xfs-kata.yaml b/deploy/kubernetes-1.16/pmem-storageclass-xfs-kata.yaml new file mode 120000 index 0000000000..7e53d6ab8b --- /dev/null +++ b/deploy/kubernetes-1.16/pmem-storageclass-xfs-kata.yaml @@ -0,0 +1 @@ +../common/pmem-storageclass-xfs-kata.yaml \ No newline at end of file diff --git a/deploy/kustomize/driver/pmem-csi.yaml b/deploy/kustomize/driver/pmem-csi.yaml index 5fd33d6076..3fdfd8f2bb 100644 --- a/deploy/kustomize/driver/pmem-csi.yaml +++ b/deploy/kustomize/driver/pmem-csi.yaml @@ -162,6 +162,11 @@ spec: mountPath: /dev - name: pmem-state-dir mountPath: /var/lib/pmem-csi.intel.com + # Needed for Kata Containers: we mount the PMEM volume inside our + # state dir and want that to be visible also on the host, because + # the host will need access to the image file that we create inside + # that mounted fs. + mountPropagation: Bidirectional - name: driver-registrar imagePullPolicy: Always image: quay.io/k8scsi/csi-node-driver-registrar:v1.X.Y diff --git a/docs/design.md b/docs/design.md index 25225b5263..32c177143b 100644 --- a/docs/design.md +++ b/docs/design.md @@ -4,6 +4,7 @@ - [Architecture and Operation](#architecture-and-operation) - [LVM device mode](#lvm-device-mode) - [Direct device mode](#direct-device-mode) + - [Kata Containers support](#kata-containers-support) - [Driver modes](#driver-modes) - [Driver Components](#driver-components) - [Communication between components](#communication-between-components) @@ -126,6 +127,47 @@ In direct device mode, the driver does not attempt to limit space use. It also does not mark "own" namespaces. The _Name_ field of a namespace gets value of the VolumeID. +## Kata Container support + +[Kata Containers](https://katacontainers.io) runs applications inside a +virtual machine. 
This poses a problem for App Direct mode, because +access to the filesystem prepared by PMEM-CSI is provided inside the +virtual machine by the 9p or virtio-fs filesystems. Both do not +support App Direct mode: +- 9p does not support `mmap` at all. +- virtio-fs only supports it when not using `MAP_SYNC`, i.e. without dax + semantic. + +This gets solved as follows: +- PMEM-CSI creates a volume as usual, either in direct mode or LVM mode. +- Inside that volume it sets up an ext4 or xfs filesystem. +- Inside that filesystem it creates a `pmem-csi-vm.img` file that contains + partition tables, dax metadata and a partition that takes up most of the + space available in the volume. +- That partition is bound to a `/dev/loop` device and the formatted + with the requested filesystem type for the volume. +- When an application needs access to the volume, PMEM-CSI mounts + that `/dev/loop` device. +- An application not running under Kata Containers then uses + that filesystem normally *but* due to limitations in the Linux + kernel, mounting might have to be done without `-odax` and thus + App Direct access does not work. +- When the Kata Container runtime is asked to provide access to that + filesystem, it will instead pass the underlying `pmem-csi-vm.img` + file into QEMU as a [nvdimm + device](https://github.com/qemu/qemu/blob/master/docs/nvdimm.txt) + and inside the VM mount the `/dev/pmem0p1` partition that the + Linux kernel sets up based on the dax meta data that was placed in the + file by PMEM-CSI. Inside the VM, the App Direct semantic is fully + supported. + +Such volumes can be used with full dax semantic *only* inside Kata +Containers. They are still usable with other runtimes, just not +with dax semantic. 
Because of that and the additional space overhead, +Kata Container support has to be enabled explicitly via a [storage +class parameter and Kata Containers must be set up +appropriately](install.md#kata-containers-support) + ## Driver modes The PMEM-CSI driver supports running in different modes, which can be @@ -388,4 +430,4 @@ that don't use PMEM-CSI at all. Users must take care to create PVCs first, then the pods if they want to use the webhook. In practice, that is often already done because it -is more natural, so it is not a big limitation. \ No newline at end of file +is more natural, so it is not a big limitation. diff --git a/docs/install.md b/docs/install.md index 94780bbea8..efbc12bcd2 100644 --- a/docs/install.md +++ b/docs/install.md @@ -8,6 +8,8 @@ - [Get source code](#get-source-code) - [Run PMEM-CSI on Kubernetes](#run-pmem-csi-on-kubernetes) - [Expose persistent and cache volumes to applications](#expose-persistent-and-cache-volumes-to-applications) + - [Kata Containers support](#kata-containers-support) + - [Ephemeral inline volumes](#ephemeral-inline-volumes) - [Raw block volumes](#raw-block-volumes) - [Enable scheduler extensions](#enable-scheduler-extensions) - [Filing issues and contributing](#filing-issues-and-contributing) @@ -381,12 +383,72 @@ Check with the provided * A node is only chosen the first time a pod starts. After that it will always restart on that node, because that is where the persistent volume was created. -Volume requests embedded in Pod spec are provisioned as ephemeral volumes. The volume request could use below fields as [`volumeAttributes`](https://kubernetes.io/docs/concepts/storage/volumes/#csi): +### Kata Containers support + +[Kata Containers support](design.md#kata-containers-support) gets enabled via +the `kataContainers` storage class parameter. 
It accepts the following +values: +* `true/1/t/TRUE` + Create the filesystem inside a partition inside a file, try to mount + on the host through a loop device with `-o dax` but proceed without + `-o dax` when the kernel does not support that. Currently Linux up + to and including 5.4 do not support it. In other words, on the host + such volumes are usable, but only without DAX. Inside Kata + Containers, DAX works. +* `false/0/f/FALSE` (default) + Create the filesystem directly on the volume. + +[Raw block volumes](#raw-block-volumes) are only supported with +`kataContainers: false`. Attempts to create them with `kataContainers: +true` are rejected. + +At the moment (= Kata Containers 1.11.0-rc0), only Kata Containers +with QEMU enable the special support for such volumes. Without QEMU or +with older releases of Kata Containers, the volume is still usable +through the normal remote filesystem support (9p or virtio-fs). Support +for Cloud Hypervisor is [in +progress](https://github.com/kata-containers/runtime/issues/2575). + +With Kata Containers for QEMU, the VM must be configured appropriately +to allow adding the PMEM volumes to their address space. This can be +done globally by setting the `memory_offset` in the +[`configuration-qemu.toml` +file](https://github.com/kata-containers/runtime/blob/ee985a608015d81772901c1d9999190495fc9a0a/cli/config/configuration-qemu.toml.in#L86-L91) +or per-pod by setting the +[`io.katacontainers.config.hypervisor.memory_offset` +label](https://github.com/kata-containers/documentation/blob/master/how-to/how-to-set-sandbox-config-kata.md#hypervisor-options) +in the pod meta data. 
In both cases, the value has to be large enough +for all PMEM volumes used by the pod, otherwise pod creation will fail +with an error similar to this: + +``` +Error: container create failed: QMP command failed: not enough space, currently 0x8000000 in use of total space for memory devices 0x3c100000 +``` + +The examples for usage of Kata Containers [with +ephemeral](../deploy/common/pmem-kata-app-ephemeral.yaml) and +[persistent](../deploy/common/pmem-kata-app.yaml) volumes use the pod +label. They assume that the `kata-qemu` runtime class [is +installed](https://github.com/kata-containers/packaging/tree/1.11.0-rc0/kata-deploy#run-a-sample-workload). + +For the QEMU test cluster, +[`setup-kata-containers.sh`](../test/setup-kata-containers.sh) can be +used to install Kata Containers. However, this currently only works on +Clear Linux because on Fedora, the Docker container runtime is used +and Kata Containers does not support that one. + + +### Ephemeral inline volumes + +Volume requests [embedded in the pod spec]() are provisioned as +ephemeral volumes. The volume request could use below fields as +[`volumeAttributes`](https://kubernetes.io/docs/concepts/storage/volumes/#csi): |key|meaning|optional|values| |---|-------|--------|-------------| |`size`|Size of the requested ephemeral volume as [Kubernetes memory string](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) ("1Mi" = 1024*1024 bytes, "1e3K = 1000000 bytes)|No|| |`eraseAfter`|Clear all data after use and before
deleting the volume|Yes|`true` (default),
`false`| +|`kataContainers`|Prepare volume for use in Kata Containers.|Yes|`false/0/f/FALSE` (default),
`true/1/t/TRUE`| Check with provided [example application](/deploy/kubernetes-1.15/pmem-app-ephemeral.yaml) for ephemeral volume usage. diff --git a/pkg/pmem-csi-driver/nodeserver.go b/pkg/pmem-csi-driver/nodeserver.go index 3d9114d696..2df7331873 100644 --- a/pkg/pmem-csi-driver/nodeserver.go +++ b/pkg/pmem-csi-driver/nodeserver.go @@ -23,9 +23,11 @@ import ( "k8s.io/klog" "k8s.io/utils/mount" + "github.com/intel/pmem-csi/pkg/imagefile" "github.com/intel/pmem-csi/pkg/pmem-csi-driver/parameters" pmdmanager "github.com/intel/pmem-csi/pkg/pmem-device-manager" pmemexec "github.com/intel/pmem-csi/pkg/pmem-exec" + "github.com/intel/pmem-csi/pkg/volumepathhandler" ) const ( @@ -34,6 +36,13 @@ const ( // the driver volumeProvisionerIdentity = "storage.kubernetes.io/csiProvisionerIdentity" defaultFilesystem = "ext4" + + // kataContainersImageFilename is the image file that Kata Containers + // needs to make available inside the VM. + kataContainersImageFilename = "kata-containers-pmem-csi-vm.img" + + // Mount point inside the target directory for the original volume. + kataContainersMount = "kata-containers-host-volume" ) type volumeInfo struct { @@ -48,12 +57,15 @@ type nodeServer struct { // Driver deployed to provision only ephemeral volumes(only for Kubernetes v1.15) mounter mount.Interface volInfo map[string]volumeInfo + + // A directory for additional mount points. 
+ mountDirectory string } var _ csi.NodeServer = &nodeServer{} var _ PmemService = &nodeServer{} -func NewNodeServer(cs *nodeControllerServer) *nodeServer { +func NewNodeServer(cs *nodeControllerServer, mountDirectory string) *nodeServer { return &nodeServer{ nodeCaps: []*csi.NodeServiceCapability{ { @@ -64,9 +76,10 @@ func NewNodeServer(cs *nodeControllerServer) *nodeServer { }, }, }, - cs: cs, - mounter: mount.New(""), - volInfo: map[string]volumeInfo{}, + cs: cs, + mounter: mount.New(""), + volInfo: map[string]volumeInfo{}, + mountDirectory: mountDirectory, } } @@ -146,8 +159,15 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis ephemeral = device == nil && !ok && len(srcPath) == 0 } + var volumeParameters parameters.Volume if ephemeral { - device, err := ns.createEphemeralDevice(ctx, req) + v, err := parameters.Parse(parameters.EphemeralVolumeOrigin, req.GetVolumeContext()) + if err != nil { + return nil, status.Error(codes.InvalidArgument, "ephemeral inline volume parameters: "+err.Error()) + } + volumeParameters = v + + device, err := ns.createEphemeralDevice(ctx, req, volumeParameters) if err != nil { // createEphemeralDevice() returns status.Error, so safe to return return nil, err @@ -155,10 +175,12 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis srcPath = device.Path mountFlags = append(mountFlags, "dax") } else { - // Validate parameters. We don't actually use any of them here, but a sanity check is worthwhile anyway. - if _, err := parameters.Parse(parameters.PersistentVolumeOrigin, req.GetVolumeContext()); err != nil { + // Validate parameters. 
+ v, err := parameters.Parse(parameters.PersistentVolumeOrigin, req.GetVolumeContext()) + if err != nil { return nil, status.Error(codes.InvalidArgument, "persistent volume context: "+err.Error()) } + volumeParameters = v if device, err = ns.cs.dm.GetDevice(req.VolumeId); err != nil { if errors.Is(err, pmdmanager.ErrDeviceNotFound) { @@ -178,26 +200,12 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis sort.Strings(mountFlags) joinedMountFlags := strings.Join(mountFlags[:], ",") + rawBlock := false switch req.VolumeCapability.GetAccessType().(type) { case *csi.VolumeCapability_Block: + rawBlock = true // For block volumes, source path is the actual Device path srcPath = device.Path - targetDir := filepath.Dir(targetPath) - // Make sure that parent directory of target path is existing, otherwise create it - targetPreExisting, err := ensureDirectory(ns.mounter, targetDir) - if err != nil { - return nil, err - } - f, err := os.OpenFile(targetPath, os.O_CREATE, os.FileMode(0644)) - defer f.Close() - if err != nil && !os.IsExist(err) { - if !targetPreExisting { - if rerr := os.Remove(targetDir); rerr != nil { - klog.Warningf("Could not remove created mount target %q: %v", targetDir, rerr) - } - } - return nil, status.Errorf(codes.Internal, "Could not create target device file %q: %v", targetPath, err) - } case *csi.VolumeCapability_Mount: if !ephemeral && len(srcPath) == 0 { return nil, status.Error(codes.FailedPrecondition, "Staging target path missing in request") @@ -232,19 +240,95 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis return nil, status.Error(codes.AlreadyExists, "Volume published but is incompatible") } } + } - if err := os.Mkdir(targetPath, os.FileMode(0755)); err != nil { - // Kubernetes is violating the CSI spec and creates the - // directory for us - // (https://github.com/kubernetes/kubernetes/issues/75535). We - // allow that by ignoring the "already exists" error. 
- if !os.IsExist(err) { - return nil, status.Error(codes.Internal, "make target dir: "+err.Error()) - } + if rawBlock && volumeParameters.GetKataContainers() { + // We cannot pass block devices with DAX semantic into QEMU. + // TODO: add validation of CreateVolumeRequest.VolumeCapabilities and already detect the problem there. + return nil, status.Error(codes.InvalidArgument, "raw block volumes are incompatible with Kata Containers") + } + + // We always (bind) mount. This is not strictly necessary for + // Kata Containers and persistent volumes because we could use + // the mounted filesystem at the staging path, but done anyway + // for two reason: + // - avoids special cases + // - we don't have the staging path in NodeUnpublishVolume + // and can't undo some of the operations there if they refer to + // the staging path + hostMount := targetPath + if volumeParameters.GetKataContainers() { + // Create a mount point inside our state directory. Mount directory gets created here, + // the mount point itself in ns.mount(). + if err := os.MkdirAll(ns.mountDirectory, os.FileMode(0755)); err != nil && !os.IsExist(err) { + return nil, status.Error(codes.Internal, "create parent directory for mounts: "+err.Error()) + } + hostMount = filepath.Join(ns.mountDirectory, req.GetVolumeId()) + } + if err := ns.mount(srcPath, hostMount, mountFlags, rawBlock); err != nil { + return nil, status.Error(codes.Internal, err.Error()) + } + + if !volumeParameters.GetKataContainers() { + // A normal volume, return early. 
+ ns.volInfo[req.VolumeId] = volumeInfo{readOnly: readOnly, targetPath: targetPath, mountFlags: joinedMountFlags} + return &csi.NodePublishVolumeResponse{}, nil + } + + // When we get here, the volume is for Kata Containers and the + // following has already happened: + // - persistent filesystem volumes or ephemeral volume: formatted and mounted at hostMount + // - persistent raw block volumes: we bailed out with an error + // + // To support Kata Containers for ephemeral and persistent filesystem volumes, + // we now need to: + // - create the image file inside the mounted volume + // - bind-mount the partition inside that file to a loop device + // - mount the loop device instead of the original volume + // + // All of that has to be idempotent (because we might get + // killed while working on this) *and* we have to undo it when + // returning a failure (because then NodePublishVolume might + // not be called again - see in particular the final errors in + // https://github.com/kubernetes/kubernetes/blob/ca532c6fb2c08f859eca13e0557f3b2aec9a18e0/pkg/volume/csi/csi_client.go#L627-L649). + + // There's some overhead for the imagefile inside the host filesystem, but that should be small + // relative to the size of the volumes, so we simply create an image file that is as large as + // the mounted filesystem allows. Create() is not idempotent, so we have to check for the + // file before overwriting something that was already created earlier. 
+ imageFile := filepath.Join(hostMount, kataContainersImageFilename) + if _, err := os.Stat(imageFile); err != nil { + if !os.IsNotExist(err) { + return nil, status.Error(codes.Internal, "unexpected error while checking for image file: "+err.Error()) + } + var imageFsType imagefile.FsType + switch fsType { + case "xfs": + imageFsType = imagefile.Xfs + case "": + fallthrough + case "ext4": + imageFsType = imagefile.Ext4 + default: + return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("fsType %q not supported for Kata Containers", fsType)) } + if err := imagefile.Create(imageFile, 0 /* no fixed size */, imageFsType); err != nil { + return nil, status.Error(codes.Internal, "create Kata Container image file: "+err.Error()) + } + } + + // If this offset ever changes, then we have to make future versions of PMEM-CSI more + // flexible and dynamically determine the offset. For now we assume that the + // file was created by the current version and thus use the fixed offset. + offset := int64(imagefile.HeaderSize) + handler := volumepathhandler.VolumePathHandler{} + loopDev, err := handler.AttachFileDeviceWithOffset(imageFile, offset) + if err != nil { + return nil, status.Error(codes.Internal, "create loop device: "+err.Error()) } - if err := ns.mount(srcPath, targetPath, mountFlags); err != nil { + // TODO: Try to mount with dax first, fall back to mount without it if not supported. 
+ if err := ns.mount(loopDev, targetPath, []string{}, false); err != nil { return nil, status.Error(codes.Internal, err.Error()) } @@ -327,7 +411,15 @@ func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu klog.V(5).Infof("NodeUnpublishVolume: volume id:%s targetpath:%s has been unmounted", req.VolumeId, targetPath) } - os.Remove(targetPath) // nolint: gosec, errorchk + if p.GetKataContainers() { + if err := ns.nodeUnpublishKataContainerImage(ctx, req, p); err != nil { + return nil, err + } + } + + if err := os.Remove(targetPath); err != nil { + return nil, status.Error(codes.Internal, "unexpected error while removing target path: "+err.Error()) + } if p.GetPersistency() == parameters.PersistencyEphemeral { if _, err := ns.cs.DeleteVolume(ctx, &csi.DeleteVolumeRequest{VolumeId: vol.ID}); err != nil { @@ -339,6 +431,34 @@ func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu return &csi.NodeUnpublishVolumeResponse{}, nil } +func (ns *nodeServer) nodeUnpublishKataContainerImage(cxt context.Context, req *csi.NodeUnpublishVolumeRequest, p parameters.Volume) error { + // Reconstruct where the volume was mounted before creating the image file. + hostMount := filepath.Join(ns.mountDirectory, req.GetVolumeId()) + + // We know the parent and the well-known image name, so we have the full path now + // and can detach the loop device from it. + imageFile := filepath.Join(hostMount, kataContainersImageFilename) + handler := volumepathhandler.VolumePathHandler{} + // This will warn if the loop device is not found, but won't treat that as an error. + // This is the behavior that we want. + if err := handler.DetachFileDevice(imageFile); err != nil { + return status.Error(codes.Internal, fmt.Sprintf("remove loop device for Kata Container image file %q: %v", imageFile, err)) + } + + // We do *not* remove the image file. It may be needed again + // when mounting a persistent volume a second time. 
If not, + // it'll get deleted together with the device. But before + // the device can be deleted, we need to unmount it. + if err := ns.mounter.Unmount(hostMount); err != nil { + return status.Error(codes.Internal, fmt.Sprintf("unmount ephemeral Kata Container volume: %v", err)) + } + if err := os.Remove(hostMount); err != nil { + return status.Error(codes.Internal, "unexpected error while removing ephemeral volume mount point: "+err.Error()) + } + + return nil +} + func (ns *nodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error) { // Check arguments @@ -403,7 +523,7 @@ func (ns *nodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol mountOptions = append(mountOptions, "dax") - if err = ns.mount(device.Path, stagingtargetPath, mountOptions); err != nil { + if err = ns.mount(device.Path, stagingtargetPath, mountOptions, false /* raw block */); err != nil { return nil, status.Error(codes.Internal, err.Error()) } @@ -463,12 +583,7 @@ func (ns *nodeServer) NodeExpandVolume(context.Context, *csi.NodeExpandVolumeReq // createEphemeralDevice creates new pmem device for given req. // On failure it returns one of status errors. -func (ns *nodeServer) createEphemeralDevice(ctx context.Context, req *csi.NodePublishVolumeRequest) (*pmdmanager.PmemDeviceInfo, error) { - p, err := parameters.Parse(parameters.EphemeralVolumeOrigin, req.GetVolumeContext()) - if err != nil { - return nil, status.Error(codes.InvalidArgument, "ephemeral inline volume parameters: "+err.Error()) - } - +func (ns *nodeServer) createEphemeralDevice(ctx context.Context, req *csi.NodePublishVolumeRequest, p parameters.Volume) (*pmdmanager.PmemDeviceInfo, error) { // If the caller has use the heuristic for detecting ephemeral volumes, the flag won't // be set. Fix that here. 
ephemeral := parameters.PersistencyEphemeral @@ -498,8 +613,8 @@ func (ns *nodeServer) createEphemeralDevice(ctx context.Context, req *csi.NodePu return device, nil } -// provisionDevice initializes the device with requested filesystem -// and mounts at given targetPath. +// provisionDevice initializes the device with requested filesystem. +// It can be called multiple times for the same device (idempotent). func (ns *nodeServer) provisionDevice(device *pmdmanager.PmemDeviceInfo, fsType string) error { if fsType == "" { // Empty FsType means "unspecified" and we pick default, currently hard-coded to ext4 @@ -522,13 +637,22 @@ func (ns *nodeServer) provisionDevice(device *pmdmanager.PmemDeviceInfo, fsType } output, err := pmemexec.RunCommand(cmd, args...) if err != nil { + // If the filesystem is already mounted, then + // formatting it fails. In that case we don't need to + // format and can ignore that error. We could check + // for that ourselves in advance, but that would be + // extra code. + if strings.Contains(output, "contains a mounted filesystem") { + return nil + } return fmt.Errorf("mkfs failed: %s", output) } return nil } -func (ns *nodeServer) mount(sourcePath, targetPath string, mountOptions []string) error { +// mount creates the target path (parent must exist) and mounts the source there. It is idempotent. +func (ns *nodeServer) mount(sourcePath, targetPath string, mountOptions []string, rawBlock bool) error { notMnt, err := ns.mounter.IsLikelyNotMountPoint(targetPath) if err != nil && !os.IsNotExist(err) { return fmt.Errorf("failed to determain if '%s' is a valid mount point: %s", targetPath, err.Error()) @@ -537,13 +661,20 @@ func (ns *nodeServer) mount(sourcePath, targetPath string, mountOptions []string return nil } - if err := os.Mkdir(targetPath, os.FileMode(0755)); err != nil { - // Kubernetes is violating the CSI spec and creates the - // directory for us - // (https://github.com/kubernetes/kubernetes/issues/75535). 
We - // allow that by ignoring the "already exists" error. - if !os.IsExist(err) { - return fmt.Errorf("failed to create '%s': %s", targetPath, err.Error()) + // Create target path, using a file for raw block bind mounts + // or a directory for filesystems. Might already exist from a + // previous call or because Kubernetes erronously created it + // for us. + if rawBlock { + f, err := os.OpenFile(targetPath, os.O_CREATE, os.FileMode(0644)) + if err == nil { + defer f.Close() + } else if !os.IsExist(err) { + return fmt.Errorf("create target device file: %w", err) + } + } else { + if err := os.Mkdir(targetPath, os.FileMode(0755)); err != nil && !os.IsExist(err) { + return fmt.Errorf("create target directory: %w", err) } } diff --git a/pkg/pmem-csi-driver/parameters/parameters.go b/pkg/pmem-csi-driver/parameters/parameters.go index 4c541a7df9..0e27739e91 100644 --- a/pkg/pmem-csi-driver/parameters/parameters.go +++ b/pkg/pmem-csi-driver/parameters/parameters.go @@ -21,6 +21,7 @@ type Origin int const ( CacheSize = "cacheSize" EraseAfter = "eraseafter" + KataContainers = "kataContainers" Name = "name" PersistencyModel = "persistencyModel" VolumeID = "_id" @@ -58,6 +59,7 @@ var valid = map[Origin][]string{ CreateVolumeOrigin: []string{ CacheSize, EraseAfter, + KataContainers, PersistencyModel, }, @@ -65,6 +67,7 @@ var valid = map[Origin][]string{ CreateVolumeInternalOrigin: []string{ CacheSize, EraseAfter, + KataContainers, PersistencyModel, VolumeID, @@ -73,6 +76,7 @@ var valid = map[Origin][]string{ // Parameters from Kubernetes and users. 
EphemeralVolumeOrigin: []string{ EraseAfter, + KataContainers, PodInfoPrefix, Size, }, @@ -85,6 +89,7 @@ var valid = map[Origin][]string{ PersistentVolumeOrigin: []string{ CacheSize, EraseAfter, + KataContainers, PersistencyModel, Name, @@ -97,6 +102,7 @@ var valid = map[Origin][]string{ NodeVolumeOrigin: []string{ CacheSize, EraseAfter, + KataContainers, Name, PersistencyModel, Size, @@ -108,12 +114,13 @@ var valid = map[Origin][]string{ // The accessor functions always return a value, if unset // the default. type Volume struct { - CacheSize *uint - EraseAfter *bool - Name *string - Persistency *Persistency - Size *int64 - VolumeID *string + CacheSize *uint + EraseAfter *bool + KataContainers *bool + Name *string + Persistency *Persistency + Size *int64 + VolumeID *string } // VolumeContext represents the same settings as a string map. @@ -171,6 +178,12 @@ func Parse(origin Origin, stringmap map[string]string) (Volume, error) { } u := uint(c) result.CacheSize = &u + case KataContainers: + b, err := strconv.ParseBool(value) + if err != nil { + return result, fmt.Errorf("parameter %q: failed to parse %q as boolean: %v", key, value, err) + } + result.KataContainers = &b case Size: quantity, err := resource.ParseQuantity(value) if err != nil { @@ -240,6 +253,9 @@ func (v Volume) ToContext() VolumeContext { if v.Size != nil { result[Size] = fmt.Sprintf("%d", *v.Size) } + if v.KataContainers != nil { + result[KataContainers] = fmt.Sprintf("%v", *v.KataContainers) + } return result } @@ -279,6 +295,13 @@ func (v Volume) GetSize() int64 { return 0 } +func (v Volume) GetKataContainers() bool { + if v.KataContainers != nil { + return *v.KataContainers + } + return false +} + func (v Volume) GetVolumeID() string { if v.VolumeID != nil { return *v.VolumeID diff --git a/pkg/pmem-csi-driver/pmem-csi-driver.go b/pkg/pmem-csi-driver/pmem-csi-driver.go index 75474b01e2..f14a0954fa 100644 --- a/pkg/pmem-csi-driver/pmem-csi-driver.go +++ b/pkg/pmem-csi-driver/pmem-csi-driver.go @@ 
-16,6 +16,7 @@ import ( "net/http" "os" "os/signal" + "path/filepath" "syscall" "time" @@ -280,7 +281,7 @@ func (pmemd *pmemDriver) Run() error { return err } cs := NewNodeControllerServer(pmemd.cfg.NodeID, dm, sm) - ns := NewNodeServer(cs) + ns := NewNodeServer(cs, filepath.Clean(pmemd.cfg.StateBasePath)+"/mount") if pmemd.cfg.Endpoint != pmemd.cfg.ControllerEndpoint { if err := s.Start(pmemd.cfg.ControllerEndpoint, pmemd.serverTLSConfig, cs); err != nil { diff --git a/pkg/volumepathhandler/volume_path_handler.go b/pkg/volumepathhandler/volume_path_handler.go new file mode 100644 index 0000000000..c39a26b715 --- /dev/null +++ b/pkg/volumepathhandler/volume_path_handler.go @@ -0,0 +1,298 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package volumepathhandler + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "k8s.io/klog" + "k8s.io/utils/exec" + "k8s.io/utils/mount" + + "k8s.io/apimachinery/pkg/types" +) + +const ( + losetupPath = "losetup" + statPath = "stat" + ErrDeviceNotFound = "device not found" + ErrDeviceNotSupported = "device not supported" +) + +// BlockVolumePathHandler defines a set of operations for handling block volume-related operations +type BlockVolumePathHandler interface { + // MapDevice creates a symbolic link to block device under specified map path + MapDevice(devicePath string, mapPath string, linkName string, bindMount bool) error + // UnmapDevice removes a symbolic link to block device under specified map path + UnmapDevice(mapPath string, linkName string, bindMount bool) error + // RemovePath removes a file or directory on specified map path + RemoveMapPath(mapPath string) error + // IsSymlinkExist retruns true if specified symbolic link exists + IsSymlinkExist(mapPath string) (bool, error) + // IsDeviceBindMountExist retruns true if specified bind mount exists + IsDeviceBindMountExist(mapPath string) (bool, error) + // GetDeviceBindMountRefs searches bind mounts under global map path + GetDeviceBindMountRefs(devPath string, mapPath string) ([]string, error) + // FindGlobalMapPathUUIDFromPod finds {pod uuid} symbolic link under globalMapPath + // corresponding to map path symlink, and then return global map path with pod uuid. + FindGlobalMapPathUUIDFromPod(pluginDir, mapPath string, podUID types.UID) (string, error) + // AttachFileDevice takes a path to a regular file and makes it available as an + // attached block device. + AttachFileDevice(path string) (string, error) + // DetachFileDevice takes a path to the attached block device and + // detach it from block device. + DetachFileDevice(path string) error + // GetLoopDevice returns the full path to the loop device associated with the given path. 
+ GetLoopDevice(path string) (string, error) +} + +// NewBlockVolumePathHandler returns a new instance of BlockVolumeHandler. +func NewBlockVolumePathHandler() BlockVolumePathHandler { + var volumePathHandler VolumePathHandler + return volumePathHandler +} + +// VolumePathHandler is path related operation handlers for block volume +type VolumePathHandler struct { +} + +// MapDevice creates a symbolic link to block device under specified map path +func (v VolumePathHandler) MapDevice(devicePath string, mapPath string, linkName string, bindMount bool) error { + // Example of global map path: + // globalMapPath/linkName: plugins/kubernetes.io/{PluginName}/{DefaultKubeletVolumeDevicesDirName}/{volumePluginDependentPath}/{podUid} + // linkName: {podUid} + // + // Example of pod device map path: + // podDeviceMapPath/linkName: pods/{podUid}/{DefaultKubeletVolumeDevicesDirName}/{escapeQualifiedPluginName}/{volumeName} + // linkName: {volumeName} + if len(devicePath) == 0 { + return fmt.Errorf("failed to map device to map path. devicePath is empty") + } + if len(mapPath) == 0 { + return fmt.Errorf("failed to map device to map path. 
mapPath is empty") + } + if !filepath.IsAbs(mapPath) { + return fmt.Errorf("the map path should be absolute: map path: %s", mapPath) + } + klog.V(5).Infof("MapDevice: devicePath %s", devicePath) + klog.V(5).Infof("MapDevice: mapPath %s", mapPath) + klog.V(5).Infof("MapDevice: linkName %s", linkName) + + // Check and create mapPath + _, err := os.Stat(mapPath) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("cannot validate map path: %s: %v", mapPath, err) + } + if err = os.MkdirAll(mapPath, 0750); err != nil { + return fmt.Errorf("failed to mkdir %s: %v", mapPath, err) + } + + if bindMount { + return mapBindMountDevice(v, devicePath, mapPath, linkName) + } + return mapSymlinkDevice(v, devicePath, mapPath, linkName) +} + +func mapBindMountDevice(v VolumePathHandler, devicePath string, mapPath string, linkName string) error { + // Check bind mount exists + linkPath := filepath.Join(mapPath, string(linkName)) + + file, err := os.Stat(linkPath) + if err != nil { + if !os.IsNotExist(err) { + return fmt.Errorf("failed to stat file %s: %v", linkPath, err) + } + + // Create file + newFile, err := os.OpenFile(linkPath, os.O_CREATE|os.O_RDWR, 0750) + if err != nil { + return fmt.Errorf("failed to open file %s: %v", linkPath, err) + } + if err := newFile.Close(); err != nil { + return fmt.Errorf("failed to close file %s: %v", linkPath, err) + } + } else { + // Check if device file + // TODO: Need to check if this device file is actually the expected bind mount + if file.Mode()&os.ModeDevice == os.ModeDevice { + klog.Warningf("Warning: Map skipped because bind mount already exist on the path: %v", linkPath) + return nil + } + + klog.Warningf("Warning: file %s is already exist but not mounted, skip creating file", linkPath) + } + + // Bind mount file + mounter := &mount.SafeFormatAndMount{Interface: mount.New(""), Exec: exec.New()} + if err := mounter.Mount(devicePath, linkPath, "" /* fsType */, []string{"bind"}); err != nil { + return fmt.Errorf("failed to bind 
mount devicePath: %s to linkPath %s: %v", devicePath, linkPath, err) + } + + return nil +} + +func mapSymlinkDevice(v VolumePathHandler, devicePath string, mapPath string, linkName string) error { + // Remove old symbolic link(or file) then create new one. + // This should be done because current symbolic link is + // stale across node reboot. + linkPath := filepath.Join(mapPath, string(linkName)) + if err := os.Remove(linkPath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove file %s: %v", linkPath, err) + } + return os.Symlink(devicePath, linkPath) +} + +// UnmapDevice removes a symbolic link associated to block device under specified map path +func (v VolumePathHandler) UnmapDevice(mapPath string, linkName string, bindMount bool) error { + if len(mapPath) == 0 { + return fmt.Errorf("failed to unmap device from map path. mapPath is empty") + } + klog.V(5).Infof("UnmapDevice: mapPath %s", mapPath) + klog.V(5).Infof("UnmapDevice: linkName %s", linkName) + + if bindMount { + return unmapBindMountDevice(v, mapPath, linkName) + } + return unmapSymlinkDevice(v, mapPath, linkName) +} + +func unmapBindMountDevice(v VolumePathHandler, mapPath string, linkName string) error { + // Check bind mount exists + linkPath := filepath.Join(mapPath, string(linkName)) + if isMountExist, checkErr := v.IsDeviceBindMountExist(linkPath); checkErr != nil { + return checkErr + } else if !isMountExist { + klog.Warningf("Warning: Unmap skipped because bind mount does not exist on the path: %v", linkPath) + + // Check if linkPath still exists + if _, err := os.Stat(linkPath); err != nil { + if !os.IsNotExist(err) { + return fmt.Errorf("failed to check if path %s exists: %v", linkPath, err) + } + // linkPath has already been removed + return nil + } + // Remove file + if err := os.Remove(linkPath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove file %s: %v", linkPath, err) + } + return nil + } + + // Unmount file + mounter := 
&mount.SafeFormatAndMount{Interface: mount.New(""), Exec: exec.New()} + if err := mounter.Unmount(linkPath); err != nil { + return fmt.Errorf("failed to unmount linkPath %s: %v", linkPath, err) + } + + // Remove file + if err := os.Remove(linkPath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove file %s: %v", linkPath, err) + } + + return nil +} + +func unmapSymlinkDevice(v VolumePathHandler, mapPath string, linkName string) error { + // Check symbolic link exists + linkPath := filepath.Join(mapPath, string(linkName)) + if islinkExist, checkErr := v.IsSymlinkExist(linkPath); checkErr != nil { + return checkErr + } else if !islinkExist { + klog.Warningf("Warning: Unmap skipped because symlink does not exist on the path: %v", linkPath) + return nil + } + return os.Remove(linkPath) +} + +// RemoveMapPath removes a file or directory on specified map path +func (v VolumePathHandler) RemoveMapPath(mapPath string) error { + if len(mapPath) == 0 { + return fmt.Errorf("failed to remove map path. mapPath is empty") + } + klog.V(5).Infof("RemoveMapPath: mapPath %s", mapPath) + err := os.RemoveAll(mapPath) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove directory %s: %v", mapPath, err) + } + return nil +} + +// IsSymlinkExist returns true if specified file exists and the type is symbolik link. +// If file doesn't exist, or file exists but not symbolic link, return false with no error. +// On other cases, return false with error from Lstat(). 
+func (v VolumePathHandler) IsSymlinkExist(mapPath string) (bool, error) { + fi, err := os.Lstat(mapPath) + if err != nil { + // If file doesn't exist, return false and no error + if os.IsNotExist(err) { + return false, nil + } + // Return error from Lstat() + return false, fmt.Errorf("failed to Lstat file %s: %v", mapPath, err) + } + // If file exits and it's symbolic link, return true and no error + if fi.Mode()&os.ModeSymlink == os.ModeSymlink { + return true, nil + } + // If file exits but it's not symbolic link, return fale and no error + return false, nil +} + +// IsDeviceBindMountExist returns true if specified file exists and the type is device. +// If file doesn't exist, or file exists but not device, return false with no error. +// On other cases, return false with error from Lstat(). +func (v VolumePathHandler) IsDeviceBindMountExist(mapPath string) (bool, error) { + fi, err := os.Lstat(mapPath) + if err != nil { + // If file doesn't exist, return false and no error + if os.IsNotExist(err) { + return false, nil + } + + // Return error from Lstat() + return false, fmt.Errorf("failed to Lstat file %s: %v", mapPath, err) + } + // If file exits and it's device, return true and no error + if fi.Mode()&os.ModeDevice == os.ModeDevice { + return true, nil + } + // If file exits but it's not device, return fale and no error + return false, nil +} + +// GetDeviceBindMountRefs searches bind mounts under global map path +func (v VolumePathHandler) GetDeviceBindMountRefs(devPath string, mapPath string) ([]string, error) { + var refs []string + files, err := ioutil.ReadDir(mapPath) + if err != nil { + return nil, fmt.Errorf("directory cannot read %v", err) + } + for _, file := range files { + if file.Mode()&os.ModeDevice != os.ModeDevice { + continue + } + filename := file.Name() + // TODO: Might need to check if the file is actually linked to devPath + refs = append(refs, filepath.Join(mapPath, filename)) + } + klog.V(5).Infof("GetDeviceBindMountRefs: refs %v", refs) 
+ return refs, nil +} diff --git a/pkg/volumepathhandler/volume_path_handler_linux.go b/pkg/volumepathhandler/volume_path_handler_linux.go new file mode 100644 index 0000000000..ec1c279006 --- /dev/null +++ b/pkg/volumepathhandler/volume_path_handler_linux.go @@ -0,0 +1,270 @@ +// +build linux + +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Forked from https://github.com/kubernetes/kubernetes/raw/8a09460c2f7ba8f6acd8a6fb7603ed3ac4805eb6/pkg/volume/util/volumepathhandler/volume_path_handler_linux.go +// to add file offset to AttachFileDevice. + +package volumepathhandler + +import ( + "bufio" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + + "golang.org/x/sys/unix" + + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog" +) + +// AttachFileDevice takes a path to a regular file and makes it available as an +// attached block device. +func (v VolumePathHandler) AttachFileDevice(path string) (string, error) { + return v.AttachFileDeviceWithOffset(path, 0) +} + +// AttachFileDevice takes a path to a regular file and makes its content starting +// at the given offset available as an attached block device. 
+func (v VolumePathHandler) AttachFileDeviceWithOffset(path string, offset int64) (string, error) { + blockDevicePath, err := v.GetLoopDevice(path) + if err != nil && err.Error() != ErrDeviceNotFound { + return "", fmt.Errorf("GetLoopDevice failed for path %s: %v", path, err) + } + + // If no existing loop device for the path, create one + if blockDevicePath == "" { + klog.V(4).Infof("Creating device for path: %s", path) + blockDevicePath, err = makeLoopDevice(path, offset) + if err != nil { + return "", fmt.Errorf("makeLoopDevice failed for path %s: %v", path, err) + } + } + return blockDevicePath, nil +} + +// DetachFileDevice takes a path to the attached block device and +// detach it from block device. +func (v VolumePathHandler) DetachFileDevice(path string) error { + loopPath, err := v.GetLoopDevice(path) + if err != nil { + if err.Error() == ErrDeviceNotFound { + klog.Warningf("couldn't find loopback device which takes file descriptor lock. Skip detaching device. device path: %q", path) + } else { + return fmt.Errorf("GetLoopDevice failed for path %s: %v", path, err) + } + } else { + if len(loopPath) != 0 { + err = removeLoopDevice(loopPath) + if err != nil { + return fmt.Errorf("removeLoopDevice failed for path %s: %v", path, err) + } + } + } + return nil +} + +// GetLoopDevice returns the full path to the loop device associated with the given path. +func (v VolumePathHandler) GetLoopDevice(path string) (string, error) { + _, err := os.Stat(path) + if os.IsNotExist(err) { + klog.Warningf("loop device backing file %q not found, assuming loop device does not exist either", path) + return "", errors.New(ErrDeviceNotFound) + } + if err != nil { + return "", fmt.Errorf("not attachable: %v", err) + } + + args := []string{"-j", path} + cmd := exec.Command(losetupPath, args...) 
+ out, err := cmd.CombinedOutput() + if err != nil { + klog.V(2).Infof("Failed device discover command for path %s: %v %s", path, err, out) + return "", fmt.Errorf("losetup -j %s failed: %v", path, err) + } + klog.V(5).Infof("losetup -j %q: %s", path, out) + return parseLosetupOutputForDevice(out, path) +} + +func makeLoopDevice(path string, offset int64) (string, error) { + args := []string{"-f", "--show"} + if offset != 0 { + args = append(args, "-o", fmt.Sprintf("%d", offset)) + } + args = append(args, path) + cmd := exec.Command(losetupPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + klog.V(2).Infof("Failed device create command for path: %s %v %s ", path, err, out) + return "", fmt.Errorf("losetup -f --show %s failed: %v", path, err) + } + + // losetup -f --show {path} returns device in the format: + // /dev/loop1 + if len(out) == 0 { + return "", errors.New(ErrDeviceNotFound) + } + + return strings.TrimSpace(string(out)), nil +} + +// removeLoopDevice removes specified loopback device +func removeLoopDevice(device string) error { + args := []string{"-d", device} + cmd := exec.Command(losetupPath, args...) + out, err := cmd.CombinedOutput() + if err != nil { + if _, err := os.Stat(device); os.IsNotExist(err) { + return nil + } + klog.V(2).Infof("Failed to remove loopback device: %s: %v %s", device, err, out) + return fmt.Errorf("losetup -d %s failed: %v", device, err) + } + return nil +} + +var offsetSuffix = regexp.MustCompile(`, offset \d+$`) + +func parseLosetupOutputForDevice(output []byte, path string) (string, error) { + if len(output) == 0 { + return "", errors.New(ErrDeviceNotFound) + } + + // losetup -j {path} returns device in the format: + // /dev/loop1: [0073]:148662 ({path}) + // /dev/loop2: [0073]:148662 (/dev/sdX) + // + // losetup -j shows all the loop device for the same device that has the same + // major/minor number, by resolving symlink and matching major/minor number. 
+ // Therefore, there will be other path than {path} in output, as shown in above output. + // + // Optionally the lines have ", offset {offset}" at the end. We strip those + // and return all loop devices that refer to the same file. + s := string(output) + // Find the line that exact matches to the path, or "({path})" + var matched string + scanner := bufio.NewScanner(strings.NewReader(s)) + for scanner.Scan() { + // Remove the ", offset ..." suffix? + line := scanner.Text() + index := offsetSuffix.FindStringSubmatchIndex(line) + if index != nil { + line = line[0:index[0]] + } + + // Does it now match the file path? + if strings.HasSuffix(line, "("+path+")") { + matched = line + break + } + } + if len(matched) == 0 { + return "", errors.New(ErrDeviceNotFound) + } + s = matched + + // Get device name, or the 0th field of the output separated with ":". + // We don't need 1st field or later to be splitted, so passing 2 to SplitN. + device := strings.TrimSpace(strings.SplitN(s, ":", 2)[0]) + if len(device) == 0 { + return "", errors.New(ErrDeviceNotFound) + } + return device, nil +} + +// FindGlobalMapPathUUIDFromPod finds {pod uuid} bind mount under globalMapPath +// corresponding to map path symlink, and then return global map path with pod uuid. +// (See pkg/volume/volume.go for details on a global map path and a pod device map path.) +// ex. 
mapPath symlink: pods/{podUid}}/{DefaultKubeletVolumeDevicesDirName}/{escapeQualifiedPluginName}/{volumeName} -> /dev/sdX +// globalMapPath/{pod uuid} bind mount: plugins/kubernetes.io/{PluginName}/{DefaultKubeletVolumeDevicesDirName}/{volumePluginDependentPath}/{pod uuid} -> /dev/sdX +func (v VolumePathHandler) FindGlobalMapPathUUIDFromPod(pluginDir, mapPath string, podUID types.UID) (string, error) { + var globalMapPathUUID string + // Find symbolic link named pod uuid under plugin dir + err := filepath.Walk(pluginDir, func(path string, fi os.FileInfo, err error) error { + if err != nil { + return err + } + if (fi.Mode()&os.ModeDevice == os.ModeDevice) && (fi.Name() == string(podUID)) { + klog.V(5).Infof("FindGlobalMapPathFromPod: path %s, mapPath %s", path, mapPath) + if res, err := compareBindMountAndSymlinks(path, mapPath); err == nil && res { + globalMapPathUUID = path + } + } + return nil + }) + if err != nil { + return "", fmt.Errorf("FindGlobalMapPathUUIDFromPod failed: %v", err) + } + klog.V(5).Infof("FindGlobalMapPathFromPod: globalMapPathUUID %s", globalMapPathUUID) + // Return path contains global map path + {pod uuid} + return globalMapPathUUID, nil +} + +// compareBindMountAndSymlinks returns if global path (bind mount) and +// pod path (symlink) are pointing to the same device. +// If there is an error in checking it returns error. +func compareBindMountAndSymlinks(global, pod string) (bool, error) { + // To check if bind mount and symlink are pointing to the same device, + // we need to check if they are pointing to the devices that have same major/minor number. 
+ + // Get the major/minor number for global path + devNumGlobal, err := getDeviceMajorMinor(global) + if err != nil { + return false, fmt.Errorf("getDeviceMajorMinor failed for path %s: %v", global, err) + } + + // Get the symlinked device from the pod path + devPod, err := os.Readlink(pod) + if err != nil { + return false, fmt.Errorf("failed to readlink path %s: %v", pod, err) + } + // Get the major/minor number for the symlinked device from the pod path + devNumPod, err := getDeviceMajorMinor(devPod) + if err != nil { + return false, fmt.Errorf("getDeviceMajorMinor failed for path %s: %v", devPod, err) + } + klog.V(5).Infof("CompareBindMountAndSymlinks: devNumGlobal %s, devNumPod %s", devNumGlobal, devNumPod) + + // Check if the major/minor number are the same + if devNumGlobal == devNumPod { + return true, nil + } + return false, nil +} + +// getDeviceMajorMinor returns major/minor number for the path with below format: +// major:minor (in hex) +// ex) +// fc:10 +func getDeviceMajorMinor(path string) (string, error) { + var stat unix.Stat_t + + if err := unix.Stat(path, &stat); err != nil { + return "", fmt.Errorf("failed to stat path %s: %v", path, err) + } + + devNumber := uint64(stat.Rdev) + major := unix.Major(devNumber) + minor := unix.Minor(devNumber) + + return fmt.Sprintf("%x:%x", major, minor), nil +} diff --git a/test/e2e/storage/csi_volumes.go b/test/e2e/storage/csi_volumes.go index f97656b7f4..d1b1a925b4 100644 --- a/test/e2e/storage/csi_volumes.go +++ b/test/e2e/storage/csi_volumes.go @@ -200,6 +200,43 @@ var _ = deploy.DescribeForAll("E2E", func(d *deploy.Deployment) { wg.Wait() }) }) + + // Also run some limited tests with Kata Containers. 
+ kataDriver := &manifestDriver{ + driverInfo: testsuites.DriverInfo{ + Name: d.Name + "-pmem-csi-kata", + MaxFileSize: testpatterns.FileSizeMedium, + SupportedFsType: sets.NewString( + "xfs", + "ext4", + ), + Capabilities: map[testsuites.Capability]bool{ + testsuites.CapPersistence: true, + testsuites.CapFsGroup: true, + testsuites.CapExec: true, + testsuites.CapBlock: true, + }, + SupportedSizeRange: e2evolume.SizeRange{ + // There is test in VolumeIO suite creating 102 MB of content + // so we use 110 MB as minimum size to fit that with some margin. + // TODO: fix that upstream test to have a suitable minimum size + // + // Without VolumeIO suite, 16Mi would be enough as smallest xfs system size. + // Ref: http://man7.org/linux/man-pages/man8/mkfs.xfs.8.html + Min: "110Mi", + }, + }, + scManifest: map[string]string{ + "ext4": "deploy/common/pmem-storageclass-ext4-kata.yaml", + "xfs": "deploy/common/pmem-storageclass-xfs-kata.yaml", + }, + csiDriverName: "pmem-csi.intel.com", + } + Context("Kata Containers", func() { + testsuites.DefineTestSuite(kataDriver, []func() testsuites.TestSuite{ + dax.InitDaxTestSuite, + }) + }) }) type manifestDriver struct { @@ -213,12 +250,16 @@ type manifestDriver struct { var _ testsuites.TestDriver = &manifestDriver{} var _ testsuites.DynamicPVTestDriver = &manifestDriver{} +var _ testsuites.EphemeralTestDriver = &manifestDriver{} func (m *manifestDriver) GetDriverInfo() *testsuites.DriverInfo { return &m.driverInfo } -func (m *manifestDriver) SkipUnsupportedTest(testpatterns.TestPattern) { +func (m *manifestDriver) SkipUnsupportedTest(pattern testpatterns.TestPattern) { + if !m.driverInfo.SupportedFsType.Has(pattern.FsType) { + skipper.Skipf("fsType %q not supported", pattern.FsType) + } } func (m *manifestDriver) GetDynamicProvisionStorageClass(config *testsuites.PerTestConfig, fsType string) *storagev1.StorageClass { @@ -241,6 +282,18 @@ func (m *manifestDriver) GetDynamicProvisionStorageClass(config *testsuites.PerT } func (m 
*manifestDriver) PrepareTest(f *framework.Framework) (*testsuites.PerTestConfig, func()) { + // If the driver depends on Kata Containers, first make sure + // that we have it on at least one node. + if strings.HasSuffix(m.driverInfo.Name, "-kata") { + nodes, err := f.ClientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "katacontainers.io/kata-runtime=true", + }) + framework.ExpectNoError(err, "list nodes") + if len(nodes.Items) == 0 { + skipper.Skipf("no nodes found with Kata Container runtime") + } + } + By(fmt.Sprintf("deploying %s driver", m.driverInfo.Name)) config := &testsuites.PerTestConfig{ Driver: m, @@ -272,6 +325,9 @@ func (m *manifestDriver) GetVolume(config *testsuites.PerTestConfig, volumeNumbe attributes := map[string]string{"size": m.driverInfo.SupportedSizeRange.Min} shared := false readOnly := false + if strings.HasSuffix(m.driverInfo.Name, "-kata") { + attributes["kataContainers"] = "true" + } return attributes, shared, readOnly } diff --git a/test/e2e/storage/dax/dax.go b/test/e2e/storage/dax/dax.go index 7f6fec4a71..decfbac7d9 100644 --- a/test/e2e/storage/dax/dax.go +++ b/test/e2e/storage/dax/dax.go @@ -118,11 +118,13 @@ func (p *daxTestSuite) DefineTests(driver testsuites.TestDriver, pattern testpat } } + withKataContainers := strings.HasSuffix(driver.GetDriverInfo().Name, "-kata") + It("should support MAP_SYNC", func() { init() defer cleanup() - l.testDaxInPod(f, l.resource.Pattern.VolMode, l.resource.VolSource, l.config) + l.testDaxInPod(f, l.resource.Pattern.VolMode, l.resource.VolSource, l.config, withKataContainers) }) } @@ -130,7 +132,9 @@ func (l local) testDaxInPod( f *framework.Framework, volumeMode v1.PersistentVolumeMode, source *v1.VolumeSource, - config *testsuites.PerTestConfig) { + config *testsuites.PerTestConfig, + withKataContainers bool, +) { const ( volPath = "/vol1" @@ -163,7 +167,20 @@ func (l local) testDaxInPod( RestartPolicy: v1.RestartPolicyNever, }, } - 
e2epod.SetNodeSelection(&pod.Spec, config.ClientNodeSelection) + if withKataContainers { + pod.Name += "-kata" + runtimeClassName := "kata-qemu" + pod.Spec.RuntimeClassName = &runtimeClassName + pod.Spec.NodeSelector = map[string]string{ + "katacontainers.io/kata-runtime": "true", + } + if pod.Labels == nil { + pod.Labels = map[string]string{} + } + pod.Labels["io.katacontainers.config.hypervisor.memory_offset"] = "2147483648" // large enough for all test volumes + } else { + e2epod.SetNodeSelection(&pod.Spec, config.ClientNodeSelection) + } switch volumeMode { case v1.PersistentVolumeBlock: // This is what we would like to use: @@ -244,9 +261,36 @@ func (l local) testDaxInPod( pmempod.RunInPod(f, l.root, []string{l.daxCheckBinary}, l.daxCheckBinary+" /no-dax; if [ $? -ne 1 ]; then echo should have reported missing DAX >&2; exit 1; fi", ns, pod.Name, containerName) By("checking volume for DAX support") - pmempod.RunInPod(f, l.root, []string{l.daxCheckBinary}, l.daxCheckBinary+" /mnt/daxtest", ns, pod.Name, containerName) + pmempod.RunInPod(f, l.root, []string{l.daxCheckBinary}, "lsblk; mount | grep /mnt; "+l.daxCheckBinary+" /mnt/daxtest", ns, pod.Name, containerName) + + // Data written in a container running under Kata Containers + // should be visible also in a normal container, unless the + // volume itself is ephemeral of course. We currently don't + // have DAX support there, though. + checkWithNormalRuntime := withKataContainers && source.CSI == nil + if checkWithNormalRuntime { + By("creating file for usage under normal pod") + pmempod.RunInPod(f, l.root, nil, "touch /mnt/hello-world", ns, pod.Name, containerName) + } By(fmt.Sprintf("Deleting pod %s", pod.Name)) err := e2epod.DeletePodWithWait(f.ClientSet, pod) framework.ExpectNoError(err, "while deleting pod") + + if checkWithNormalRuntime { + // Check for data written earlier. 
+ pod.Spec.RuntimeClassName = nil + pod.Name = "data-volume-test" + + By(fmt.Sprintf("Creating pod %s", pod.Name)) + createdPod = podClient.Create(pod) + podErr := e2epod.WaitForPodRunningInNamespace(f.ClientSet, createdPod) + framework.ExpectNoError(podErr, "running second pod") + By("checking for previously created file under normal pod") + pmempod.RunInPod(f, l.root, nil, "ls -l /mnt/hello-world", ns, pod.Name, containerName) + + By(fmt.Sprintf("Deleting pod %s", pod.Name)) + err := e2epod.DeletePodWithWait(f.ClientSet, pod) + framework.ExpectNoError(err, "while deleting pod") + } } diff --git a/test/setup-deployment.sh b/test/setup-deployment.sh index 3c3dcb8d6d..b9b5d54ab6 100755 --- a/test/setup-deployment.sh +++ b/test/setup-deployment.sh @@ -30,7 +30,9 @@ esac DEPLOY=( ${TEST_DEVICEMODE}${deployment_suffix} pmem-storageclass-ext4.yaml + pmem-storageclass-ext4-kata.yaml pmem-storageclass-xfs.yaml + pmem-storageclass-xfs-kata.yaml pmem-storageclass-cache.yaml pmem-storageclass-late-binding.yaml scheduler diff --git a/test/setup-kata-containers.sh b/test/setup-kata-containers.sh new file mode 100755 index 0000000000..0e618d88b6 --- /dev/null +++ b/test/setup-kata-containers.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -o errexit +set -o pipefail + +TEST_DIRECTORY=${TEST_DIRECTORY:-$(dirname $(readlink -f $0))} +source ${TEST_CONFIG:-${TEST_DIRECTORY}/test-config.sh} + +CLUSTER=${CLUSTER:-pmem-govm} +REPO_DIRECTORY="${REPO_DIRECTORY:-$(dirname $(dirname $(readlink -f $0)))}" +CLUSTER_DIRECTORY="${CLUSTER_DIRECTORY:-${REPO_DIRECTORY}/_work/${CLUSTER}}" +SSH="${CLUSTER_DIRECTORY}/ssh.0" +KUBECTL="${SSH} kubectl" # Always use the kubectl installed in the cluster. + +curl --location --fail --silent \ + https://github.com/kata-containers/packaging/raw/${TEST_KATA_CONTAINERS_VERSION}/kata-deploy/kata-rbac/base/kata-rbac.yaml | + ${KUBECTL} apply -f - + +# kata-deploy.yaml always installs the latest Kata Containers. 
We override that +# here by locking the image to the specific version that we want. +curl --location --fail --silent \ + https://github.com/kata-containers/packaging/raw/${TEST_KATA_CONTAINERS_VERSION}/kata-deploy/kata-deploy/base/kata-deploy.yaml | + sed -e "s;image: katadocker/kata-deploy.*;image: katadocker/kata-deploy:${TEST_KATA_CONTAINERS_VERSION};" | + ${KUBECTL} apply -f - + +curl --location --fail --silent \ + https://raw.githubusercontent.com/kata-containers/packaging/${TEST_KATA_CONTAINERS_VERSION}/kata-deploy/k8s-1.14/kata-qemu-runtimeClass.yaml | + ${KUBECTL} apply -f - + +echo "Waiting for kata-deploy to label nodes..." +TIMEOUT=120 +while [ "$SECONDS" -lt "$TIMEOUT" ]; do + # We either get "No resources found in default namespace." or header plus node names. + if [ $(${KUBECTL} get nodes -l katacontainers.io/kata-runtime=true 2>&1 | wc -l) -gt 1 ]; then + echo "Kata Containers runtime available on:" + ${KUBECTL} get nodes -l katacontainers.io/kata-runtime=true + exit 0 + fi +done + +echo "kata-deploy has not labelled nodes after $TIMEOUT seconds. Is the container runtime perhaps Docker? It is not supported." +exit 1 diff --git a/test/start-kubernetes.sh b/test/start-kubernetes.sh index df8dd03c02..54eaea3c7c 100755 --- a/test/start-kubernetes.sh +++ b/test/start-kubernetes.sh @@ -56,7 +56,7 @@ case ${TEST_DISTRO} in else # Either cloud.img or cloudguest.img is fine, should have the same content. 
CLOUD_IMAGE=${CLOUD_IMAGE:-$(\ - curl -s https://download.clearlinux.org/image/latest-images | + curl -s https://download.clearlinux.org/image/latest-images.json | awk '/cloud.img|cloudguest.img/ {print $0}' | head -n1)} fi @@ -510,7 +510,8 @@ if init_workdir && create_vms && NO_PROXY=$(extend_no_proxy) && init_pmem_regions && - init_kubernetes_cluster; then + init_kubernetes_cluster && + ( [ $TEST_CRI = docker ] || ${TEST_DIRECTORY}/setup-kata-containers.sh ); then FAILED=false else exit 1 diff --git a/test/test-config.sh b/test/test-config.sh index 86f081f644..b74ae143cd 100644 --- a/test/test-config.sh +++ b/test/test-config.sh @@ -133,6 +133,11 @@ fi # to use the Kubernetes version that ships with it. : ${TEST_KUBERNETES_VERSION:=1.18} +# If non-empty, the version of Kata Containers which is to be installed +# in the Kubernetes cluster. Installation is done with +# https://hub.docker.com/r/katadocker/kata-deploy +: ${TEST_KATA_CONTAINERS_VERSION:=1.11.0-rc0} + # Kubernetes node port number # (https://kubernetes.io/docs/concepts/services-networking/service/#nodeport) # that is going to be used by kube-scheduler to reach the scheduler From d9f6adce46d547666fb02ef88647713776723c36 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Sun, 10 May 2020 21:17:15 +0200 Subject: [PATCH 24/28] test: dump Kubernetes objects after setup failure It can happen that Kubernetes comes up, but something else (like Kata Containers) doesn't. In that case "kubectl get all" may provide some hint. --- test/start-kubernetes.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/start-kubernetes.sh b/test/start-kubernetes.sh index 54eaea3c7c..25c08ead92 100755 --- a/test/start-kubernetes.sh +++ b/test/start-kubernetes.sh @@ -475,6 +475,12 @@ function cleanup() ( if $FAILED; then set +xe echo "Cluster creation failed." 
+ if [ -e ${CLUSTER_DIRECTORY}/kube.config ] && [ -e ${CLUSTER_DIRECTORY}/ssh.0 ]; then + echo "Kubernetes status: " + ${CLUSTER_DIRECTORY}/ssh.0 kubectl get all --all-namespaces -o wide + else + echo "No ${CLUSTER_DIRECTORY}/kube.config or no ssh.0 -> no Kubernetes." + fi echo "govm status:" govm list echo "Docker status:" From 9e7ff93ef21633813ae6d632e0e513870741e91f Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Sun, 10 May 2020 21:19:33 +0200 Subject: [PATCH 25/28] test: increase timeout for Kata Containers Two minutes was enough locally, but not for the CI. --- test/setup-kata-containers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/setup-kata-containers.sh b/test/setup-kata-containers.sh index 0e618d88b6..8723a86772 100755 --- a/test/setup-kata-containers.sh +++ b/test/setup-kata-containers.sh @@ -28,7 +28,7 @@ curl --location --fail --silent \ ${KUBECTL} apply -f - echo "Waiting for kata-deploy to label nodes..." -TIMEOUT=120 +TIMEOUT=300 while [ "$SECONDS" -lt "$TIMEOUT" ]; do # We either get "No resources found in default namespace." or header plus node names. if [ $(${KUBECTL} get nodes -l katacontainers.io/kata-runtime=true 2>&1 | wc -l) -gt 1 ]; then From 2001229da44652155cb09adfd3e970bcf2593fa7 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 11 May 2020 08:08:38 +0200 Subject: [PATCH 26/28] test: create target directories "make start" in an empty _work failed with: tar zxf _work/govm_0.9-alpha_Linux_amd64.tar.gz -C _work/bin/ tar: _work/bin: Cannot open: No such file or directory --- test/start-stop.make | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/start-stop.make b/test/start-stop.make index 1c36c0ef0a..942827b52f 100644 --- a/test/start-stop.make +++ b/test/start-stop.make @@ -1,9 +1,11 @@ # Build a suitable https://github.com/govm-project/govm/releases/tag/latest version. 
GOVM_VERSION=0.9-alpha _work/govm_$(GOVM_VERSION)_Linux_amd64.tar.gz: + mkdir -p $(@D) curl -L https://github.com/govm-project/govm/releases/download/$(GOVM_VERSION)/govm_$(GOVM_VERSION)_Linux_x86_64.tar.gz -o $(abspath $@) _work/bin/govm: _work/govm_$(GOVM_VERSION)_Linux_amd64.tar.gz + mkdir -p $(@D) tar zxf $< -C _work/bin/ touch $@ From 9bc0b3b17bf6d84d873e3b881a816bd21b8334b6 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 11 May 2020 08:51:17 +0200 Subject: [PATCH 27/28] test: fix kata-deploy on Clear Due to a race condition (?), kata-deploy fails in the CI because /etc/crio/crio.conf didn't exist at the time that it ran: $ kubectl logs -n kube-system kata-deploy-2dh2f copying kata artifacts onto host Add Kata Containers as a supported runtime for CRIO: cp: cannot stat '/etc/crio/crio.conf': No such file or directory Somehow it worked locally. --- test/setup-clear-govm.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/test/setup-clear-govm.sh b/test/setup-clear-govm.sh index eac35f25e0..c55f2a2850 100755 --- a/test/setup-clear-govm.sh +++ b/test/setup-clear-govm.sh @@ -153,13 +153,21 @@ EOF After=$cri_daemon.service EOF - # flannel + CRI-O + Kata Containers needs a crio.conf change (https://clearlinux.org/documentation/clear-linux/tutorials/kubernetes): + # flannel + CRI-O + Kata Containers needs a crio.conf change (https://docs.01.org/clearlinux/latest/tutorials/kubernetes.html): # If you are using CRI-O and flannel and you want to use Kata Containers, edit the /etc/crio/crio.conf file to add: # [crio.runtime] # manage_network_ns_lifecycle = true # - # We don't use Kata Containers, so that particular change is not made to /etc/crio/crio.conf - # at this time. + # That comment seems to be out-dated, /usr/share/defaults/crio/crio.conf already contains that. 
+ #
+ # If kata-runtime is installed, it may install a systemd overlay which copies the defaults to
+ # /etc/crio, something that is needed to enable Kata Containers because some entries for that
+ # runtime have to be added there. "kata-deploy" will fail if /etc/crio/crio.conf does not exist.
+ #
+ # But that systemd mechanism seems to be unreliable (failed in CI, worked locally) and is
+ # meant to be removed, so we do it ourselves here.
+ mkdir -p /etc/crio
+ cp /usr/share/defaults/crio/crio.conf /etc/crio # /opt/cni/bin is where runtimes like CRI-O expect CNI plugins. But cloud-native-basic installs into # /usr/libexec/cni. Instructions at https://clearlinux.org/documentation/clear-linux/tutorials/kubernetes#id2 From 4dcfcf50fd4d910a7b141399e44fff35ec4d0437 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 11 May 2020 22:28:21 +0200 Subject: [PATCH 28/28] test: disable Kata Containers by default Even with the "-vmx" override in the Jenkinsfile removed, nested virtualization with three levels (Azure HyperV -> QEMU (govm) -> QEMU (Kata Containers)) was not working well enough for Kata Containers: because the inner VM runs very slowly, there are timeouts in the communication between kubelet and Kata Containers ("container create failed: Failed to check if grpc server is working: rpc error: code = Unavailable desc = transport is closing"). That means that testing with Kata Containers has to be limited to bare metal. To achieve that, it's turned off by default (and thus in the CI, which only runs on Azure) and has to be enabled with TEST_KATA_CONTAINERS_VERSION=1.11.0-rc0 or by invoking test/setup-kata-containers.sh manually.
--- test/setup-kata-containers.sh | 9 +++++---- test/start-kubernetes.sh | 2 +- test/test-config.sh | 4 +++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/test/setup-kata-containers.sh b/test/setup-kata-containers.sh index 8723a86772..07a242bada 100755 --- a/test/setup-kata-containers.sh +++ b/test/setup-kata-containers.sh @@ -11,20 +11,21 @@ REPO_DIRECTORY="${REPO_DIRECTORY:-$(dirname $(dirname $(readlink -f $0)))}" CLUSTER_DIRECTORY="${CLUSTER_DIRECTORY:-${REPO_DIRECTORY}/_work/${CLUSTER}}" SSH="${CLUSTER_DIRECTORY}/ssh.0" KUBECTL="${SSH} kubectl" # Always use the kubectl installed in the cluster. +VERSION="${TEST_KATA_CONTAINERS_VERSION:-1.11.0-rc0}" curl --location --fail --silent \ - https://github.com/kata-containers/packaging/raw/${TEST_KATA_CONTAINERS_VERSION}/kata-deploy/kata-rbac/base/kata-rbac.yaml | + https://github.com/kata-containers/packaging/raw/${VERSION}/kata-deploy/kata-rbac/base/kata-rbac.yaml | ${KUBECTL} apply -f - # kata-deploy.yaml always installs the latest Kata Containers. We override that # here by locking the image to the specific version that we want. curl --location --fail --silent \ - https://github.com/kata-containers/packaging/raw/${TEST_KATA_CONTAINERS_VERSION}/kata-deploy/kata-deploy/base/kata-deploy.yaml | - sed -e "s;image: katadocker/kata-deploy.*;image: katadocker/kata-deploy:${TEST_KATA_CONTAINERS_VERSION};" | + https://github.com/kata-containers/packaging/raw/${VERSION}/kata-deploy/kata-deploy/base/kata-deploy.yaml | + sed -e "s;image: katadocker/kata-deploy.*;image: katadocker/kata-deploy:${VERSION};" | ${KUBECTL} apply -f - curl --location --fail --silent \ - https://raw.githubusercontent.com/kata-containers/packaging/${TEST_KATA_CONTAINERS_VERSION}/kata-deploy/k8s-1.14/kata-qemu-runtimeClass.yaml | + https://raw.githubusercontent.com/kata-containers/packaging/${VERSION}/kata-deploy/k8s-1.14/kata-qemu-runtimeClass.yaml | ${KUBECTL} apply -f - echo "Waiting for kata-deploy to label nodes..." 
diff --git a/test/start-kubernetes.sh b/test/start-kubernetes.sh index 25c08ead92..4e53d80a92 100755 --- a/test/start-kubernetes.sh +++ b/test/start-kubernetes.sh @@ -517,7 +517,7 @@ if init_workdir && NO_PROXY=$(extend_no_proxy) && init_pmem_regions && init_kubernetes_cluster && - ( [ $TEST_CRI = docker ] || ${TEST_DIRECTORY}/setup-kata-containers.sh ); then + ( ! [ "${TEST_KATA_CONTAINERS_VERSION}" ] || ${TEST_DIRECTORY}/setup-kata-containers.sh ); then FAILED=false else exit 1 diff --git a/test/test-config.sh b/test/test-config.sh index b74ae143cd..e2bc25a52a 100644 --- a/test/test-config.sh +++ b/test/test-config.sh @@ -136,7 +136,9 @@ fi # If non-empty, the version of Kata Containers which is to be installed # in the Kubernetes cluster. Installation is done with # https://hub.docker.com/r/katadocker/kata-deploy -: ${TEST_KATA_CONTAINERS_VERSION:=1.11.0-rc0} +# +# The version should be >= 1.11.0-rc0 for support of PMEM volumes. +: ${TEST_KATA_CONTAINERS_VERSION:=} # Kubernetes node port number # (https://kubernetes.io/docs/concepts/services-networking/service/#nodeport)