diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index bdb2ad1308..f87c6f6595 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -282,6 +282,9 @@ struct libos_inode { /* Filesystem-specific data */ void* data; + /* Number of VMAs the file is mmapped to; should be accessed using atomic operations. */ + uint64_t num_mmapped; + struct libos_lock lock; refcount_t ref_count; }; diff --git a/libos/include/libos_vma.h b/libos/include/libos_vma.h index 98499d2272..37130174a1 100644 --- a/libos/include/libos_vma.h +++ b/libos/include/libos_vma.h @@ -130,6 +130,9 @@ int msync_range(uintptr_t begin, uintptr_t end); /* Call `msync` for file mappings of `hdl` */ int msync_handle(struct libos_handle* hdl); +/* Reload file mappings of `hdl` */ +int reload_mmaped_from_file_handle(struct libos_handle* hdl); + void debug_print_all_vmas(void); /* Returns the peak amount of memory usage */ diff --git a/libos/src/bookkeep/libos_vma.c b/libos/src/bookkeep/libos_vma.c index 5af88d08a7..d3f9933d95 100644 --- a/libos/src/bookkeep/libos_vma.c +++ b/libos/src/bookkeep/libos_vma.c @@ -70,6 +70,8 @@ static void copy_vma(struct libos_vma* old_vma, struct libos_vma* new_vma) { new_vma->flags = old_vma->flags; new_vma->file = old_vma->file; if (new_vma->file) { + if (new_vma->file->inode) + (void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED); get_handle(new_vma->file); } new_vma->offset = old_vma->offset; @@ -512,6 +514,12 @@ static struct libos_vma* alloc_vma(void) { static void free_vma(struct libos_vma* vma) { if (vma->file) { + if (vma->file->inode) { + uint64_t old_num_mmapped = __atomic_fetch_sub(&vma->file->inode->num_mmapped, 1, + __ATOMIC_RELAXED); + assert(old_num_mmapped > 0); + (void)old_num_mmapped; + } put_handle(vma->file); } @@ -800,6 +808,8 @@ int bkeep_mmap_fixed(void* addr, size_t length, int prot, int flags, struct libo new_vma->file = file; if (new_vma->file) { get_handle(new_vma->file); + if (new_vma->file->inode) + (void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED); } new_vma->offset = file ? offset : 0; copy_comment(new_vma, comment ?: ""); @@ -1049,6 +1059,8 @@ int bkeep_mmap_any_in_range(void* _bottom_addr, void* _top_addr, size_t length, new_vma->file = file; if (new_vma->file) { get_handle(new_vma->file); + if (new_vma->file->inode) + (void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED); } new_vma->offset = file ? offset : 0; copy_comment(new_vma, comment ?: ""); @@ -1361,6 +1373,110 @@ static bool vma_filter_needs_msync(struct libos_vma* vma, void* arg) { return true; } +static bool vma_filter_needs_reload(struct libos_vma* vma, void* arg) { + struct libos_handle* hdl = arg; + assert(hdl && hdl->inode); /* guaranteed to have inode because invoked from `write` callback */ + + if (vma->flags & (VMA_UNMAPPED | VMA_INTERNAL | MAP_ANONYMOUS | MAP_PRIVATE)) + return false; + + assert(vma->file); /* check above filtered out non-file-backed mappings */ + + if (!vma->file->inode || vma->file->inode != hdl->inode) + return false; + + if (!vma->file->fs || !vma->file->fs->fs_ops || !vma->file->fs->fs_ops->read) + return false; + + if (!(vma->file->acc_mode & MAY_READ)) + return false; + + return true; +} + +static int reload_vma(struct libos_vma_info* vma_info) { + int ret; + struct libos_handle* file = vma_info->file; + assert(file && file->fs && file->fs->fs_ops && file->fs->fs_ops->read); + + /* NOTE: Unfortunately there's a data race here: the memory can be unmapped, or remapped, by + * another thread by the time we get to `read`. */ + uintptr_t read_begin = (uintptr_t)vma_info->addr; + uintptr_t read_end = (uintptr_t)vma_info->addr + vma_info->length; + assert(IS_ALLOC_ALIGNED(read_begin)); + assert(IS_ALLOC_ALIGNED(read_end)); + + size_t size = read_end - read_begin; + size_t read = 0; + file_off_t pos = (file_off_t)vma_info->file_offset; + pal_prot_flags_t pal_prot = LINUX_PROT_TO_PAL(vma_info->prot, vma_info->flags); + pal_prot_flags_t pal_prot_writable = pal_prot | PAL_PROT_WRITE; + + if (pal_prot != pal_prot_writable) { + /* make the area writable so that it can be reloaded */ + ret = PalVirtualMemoryProtect((void*)read_begin, size, pal_prot_writable); + if (ret < 0) + return pal_to_unix_errno(ret); + } + + while (read < size) { + size_t to_read = size - read; + ssize_t count = file->fs->fs_ops->read(file, (void*)(read_begin + read), to_read, &pos); + if (count < 0) { + if (count == -EINTR || count == -EAGAIN) { + continue; + } + ret = count; + goto out; + } else if (count == 0) { + /* it's possible that the underlying file contents do not cover the whole VMA region */ + break; + } + assert((size_t)count <= to_read); + read += count; + } + + ret = 0; +out: + if (pal_prot != pal_prot_writable) { + /* the area was made writable above; restore the original permissions */ + int protect_ret = PalVirtualMemoryProtect((void*)read_begin, size, pal_prot); + if (protect_ret < 0) { + log_error("restore original permissions failed: %s", pal_strerror(protect_ret)); + BUG(); + } + } + + return ret; +} + +/* This helper function is to reload the VMA contents of a given file handle on `write`. + * + * NOTE: the `write` callback can be invoked from multiple paths (syscalls like `munmap()`, + * `mmap(MAP_FIXED_NOREPLACE)` and `msync()`) via the `msync` callback, so blindly reloading the VMA + * contents on e.g. `munmap()` can be inefficient (but unmapping file-backed memory regions + * shouldn't be a frequent operation). */ +int reload_mmaped_from_file_handle(struct libos_handle* hdl) { + struct libos_vma_info* vma_infos; + size_t count; + + int ret = dump_vmas(&vma_infos, &count, /*begin=*/0, /*end=*/UINTPTR_MAX, + vma_filter_needs_reload, hdl); + if (ret < 0) + return ret; + + for (size_t i = 0; i < count; i++) { + ret = reload_vma(&vma_infos[i]); + if (ret < 0) + goto out; + } + + ret = 0; +out: + free_vma_info_array(vma_infos, count); + return ret; +} + static int msync_all(uintptr_t begin, uintptr_t end, struct libos_handle* hdl) { assert(IS_ALLOC_ALIGNED(begin)); assert(end == UINTPTR_MAX || IS_ALLOC_ALIGNED(end)); diff --git a/libos/src/fs/chroot/encrypted.c b/libos/src/fs/chroot/encrypted.c index 19d6d3a016..1e2852d5f3 100644 --- a/libos/src/fs/chroot/encrypted.c +++ b/libos/src/fs/chroot/encrypted.c @@ -450,18 +450,28 @@ static ssize_t chroot_encrypted_write(struct libos_handle* hdl, const void* buf, lock(&hdl->inode->lock); int ret = encrypted_file_write(enc, buf, count, *pos, &actual_count); - if (ret < 0) - goto out; + if (ret < 0) { + unlock(&hdl->inode->lock); + return ret; + } assert(actual_count <= count); *pos += actual_count; if (hdl->inode->size < *pos) hdl->inode->size = *pos; - ret = 0; -out: unlock(&hdl->inode->lock); - return ret < 0 ? ret : (ssize_t)actual_count; + + /* If there are any MAP_SHARED mappings for the file, this will read data from `enc`. */ + if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + ret = reload_mmaped_from_file_handle(hdl); + if (ret < 0) { + log_error("reload mmapped regions of file failed: %s", unix_strerror(ret)); + BUG(); + } + } + + return (ssize_t)actual_count; } static int chroot_encrypted_truncate(struct libos_handle* hdl, file_off_t size) { diff --git a/libos/src/fs/chroot/fs.c b/libos/src/fs/chroot/fs.c index d2f9bd1403..0d53726393 100644 --- a/libos/src/fs/chroot/fs.c +++ b/libos/src/fs/chroot/fs.c @@ -17,6 +17,7 @@ #include "libos_internal.h" #include "libos_lock.h" #include "libos_utils.h" +#include "libos_vma.h" #include "linux_abi/errors.h" #include "linux_abi/fs.h" #include "linux_abi/memory.h" @@ -310,7 +311,17 @@ static ssize_t chroot_write(struct libos_handle* hdl, const void* buf, size_t co hdl->inode->size = *pos; unlock(&hdl->inode->lock); } - return actual_count; + + /* If there are any MAP_SHARED mappings for the file, this will read data from `hdl`. */ + if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + ret = reload_mmaped_from_file_handle(hdl); + if (ret < 0) { + log_error("reload mmapped regions of file failed: %s", unix_strerror(ret)); + BUG(); + } + } + + return (ssize_t)actual_count; } static int chroot_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags, diff --git a/libos/src/fs/tmpfs/fs.c b/libos/src/fs/tmpfs/fs.c index cba4e89ad1..a2190a7fb2 100644 --- a/libos/src/fs/tmpfs/fs.c +++ b/libos/src/fs/tmpfs/fs.c @@ -259,8 +259,10 @@ static ssize_t tmpfs_write(struct libos_handle* hdl, const void* buf, size_t siz struct libos_mem_file* mem = inode->data; ret = mem_file_write(mem, *pos, buf, size); - if (ret < 0) - goto out; + if (ret < 0) { + unlock(&inode->lock); + return ret; + } inode->size = mem->size; @@ -268,8 +270,17 @@ static ssize_t tmpfs_write(struct libos_handle* hdl, const void* buf, size_t siz inode->mtime = time_us / USEC_IN_SEC; /* keep `ret` */ -out: unlock(&inode->lock); + + /* If there are any MAP_SHARED mappings for the file, this will read data from `hdl`. */ + if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) { + int reload_ret = reload_mmaped_from_file_handle(hdl); + if (reload_ret < 0) { + log_error("reload mmapped regions of file failed: %s", unix_strerror(reload_ret)); + BUG(); + } + } + return ret; } diff --git a/libos/test/fs/meson.build b/libos/test/fs/meson.build index cbce1090f8..ce3f8b2e90 100644 --- a/libos/test/fs/meson.build +++ b/libos/test/fs/meson.build @@ -43,6 +43,7 @@ tests = { 'open_close': {}, 'open_flags': {}, 'read_write': {}, + 'read_write_mmap': {}, 'seek_tell': {}, 'seek_tell_truncate': {}, 'stat': {}, diff --git a/libos/test/fs/read_write_mmap.c b/libos/test/fs/read_write_mmap.c new file mode 100644 index 0000000000..fdff15d78f --- /dev/null +++ b/libos/test/fs/read_write_mmap.c @@ -0,0 +1,96 @@ +#include "common.h" + +static void read_write_mmap(const char* file_path) { + const size_t size = 1024 * 1024; + int fd = open_output_fd(file_path, /*rdwr=*/true); + printf("open(%s) RW (mmap) OK\n", file_path); + + if (ftruncate(fd, size) == -1) { + close(fd); + fatal_error("ftruncate\n"); + } + + void* buf_write = alloc_buffer(size); + void* buf_read = alloc_buffer(size); + + void* m = mmap_fd(file_path, fd, PROT_READ | PROT_WRITE, 0, size); + printf("mmap_fd(%zu) OK\n", size); + fill_random(m, size); + int ret = msync(m, size, MS_SYNC); + if (ret < 0) { + close(fd); + fatal_error("msync\n"); + } + + read_fd(file_path, fd, buf_read, size); + printf("read(%s) 1 RW (mmap) OK\n", file_path); + if (memcmp(m, buf_read, size) != 0) + fatal_error("Read data via read() is different from what was written in the mapping\n"); + + fill_random(buf_write, size); + seek_fd(file_path, fd, 0, SEEK_SET); + printf("seek(%s) 1 RW (mmap) OK\n", file_path); + write_fd(file_path, fd, buf_write, size); + printf("write(%s) RW (mmap) OK\n", file_path); + seek_fd(file_path, fd, 0, SEEK_SET); + printf("seek(%s) 2 RW (mmap) OK\n", file_path); + read_fd(file_path, fd, buf_read, size); + printf("read(%s) 2 RW (mmap) OK\n", file_path); + if (memcmp(buf_write, buf_read, size) != 0) + fatal_error("Read data via read() is different from what was written via write()\n"); + if (memcmp(buf_write, m, size) != 0) + fatal_error("Read data in the mapping is different from what was written via write()\n"); + printf("compare(%s) RW (mmap) OK\n", file_path); + + munmap_fd(file_path, m, size); + printf("munmap_fd(%zu) OK\n", size); + close_fd(file_path, fd); + printf("close(%s) RW (mmap) OK\n", file_path); + free(buf_write); + free(buf_read); +} + +static void read_write_mmap_different_fds(const char* file_path) { + const size_t size = 1024 * 1024; + int fd1 = open_output_fd(file_path, /*rdwr=*/true); + printf("open(%s) RW fd1 (mmap) OK\n", file_path); + + if (ftruncate(fd1, size) == -1) { + close(fd1); + fatal_error("ftruncate fd1\n"); + } + + int fd2 = open_output_fd(file_path, /*rdwr=*/true); + printf("open(%s) RW fd2 OK\n", file_path); + + void* m = mmap_fd(file_path, fd1, PROT_READ, 0, size); + printf("mmap_fd(%zu) fd1 OK\n", size); + + void* buf_write = alloc_buffer(size); + fill_random(buf_write, size); + write_fd(file_path, fd2, buf_write, size); + printf("write(%s) RW fd2 OK\n", file_path); + + if (memcmp(m, buf_write, size) != 0) + fatal_error("Read data from the mapping is different from what was written via write() by " + "another fd of the same file\n"); + + munmap_fd(file_path, m, size); + printf("munmap_fd(%zu) fd1 OK\n", size); + close_fd(file_path, fd1); + printf("close(%s) RW fd1 (mmap) OK\n", file_path); + close_fd(file_path, fd2); + printf("close(%s) RW fd2 OK\n", file_path); + free(buf_write); +} + +int main(int argc, char* argv[]) { + if (argc < 2) + fatal_error("Usage: %s \n", argv[0]); + + setup(); + read_write_mmap(argv[1]); + read_write_mmap_different_fds(argv[1]); + + return 0; +} diff --git a/libos/test/fs/test_enc.py b/libos/test/fs/test_enc.py index c62d70875d..ee90890a6d 100644 --- a/libos/test/fs/test_enc.py +++ b/libos/test/fs/test_enc.py @@ -91,6 +91,33 @@ def test_100_open_close(self): self.verify_open_close(stdout, stderr, output_path, 'output') self.assertTrue(os.path.isfile(output_path)) + # overrides TC_00_FileSystem to not skip Gramine-SGX + def test_111_read_write_mmap(self): + file_path = os.path.join(self.OUTPUT_DIR, 'test_111') # new file to be created + stdout, stderr = self.run_binary(['read_write_mmap', file_path]) + size = '1048576' + self.assertNotIn('ERROR: ', stderr) + self.assertTrue(os.path.isfile(file_path)) + + self.assertIn('open(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('mmap_fd(' + size + ') OK', stdout) + self.assertIn('read(' + file_path + ') 1 RW (mmap) OK', stdout) + self.assertIn('seek(' + file_path + ') 1 RW (mmap) OK', stdout) + self.assertIn('write(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('seek(' + file_path + ') 2 RW (mmap) OK', stdout) + self.assertIn('read(' + file_path + ') 2 RW (mmap) OK', stdout) + self.assertIn('compare(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('munmap_fd(' + size + ') OK', stdout) + self.assertIn('close(' + file_path + ') RW (mmap) OK', stdout) + + self.assertIn('open(' + file_path + ') RW fd1 (mmap) OK', stdout) + self.assertIn('open(' + file_path + ') RW fd2 OK', stdout) + self.assertIn('mmap_fd(' + size + ') fd1 OK', stdout) + self.assertIn('write(' + file_path + ') RW fd2 OK', stdout) + self.assertIn('munmap_fd(' + size + ') fd1 OK', stdout) + self.assertIn('close(' + file_path + ') RW fd1 (mmap) OK', stdout) + self.assertIn('close(' + file_path + ') RW fd2 OK', stdout) + # overrides TC_00_FileSystem to change input dir (from plaintext to encrypted) def test_115_seek_tell(self): # the test binary expects a path to read-only (existing) file and two paths to files that diff --git a/libos/test/fs/test_fs.py b/libos/test/fs/test_fs.py index c249555752..99f0dd599c 100644 --- a/libos/test/fs/test_fs.py +++ b/libos/test/fs/test_fs.py @@ -95,6 +95,37 @@ def test_110_read_write(self): self.assertIn('compare(' + file_path + ') RW OK', stdout) self.assertIn('close(' + file_path + ') RW OK', stdout) + # Gramine's implementation of file_map doesn't currently support shared memory-mapped regular + # chroot files with write permission in PAL/Linux-SGX (like mmap(PROT_WRITE, MAP_SHARED, fd)). + # Below test requires it, so skip it. We decided not to implement it as we don't know any + # workload using it. + @unittest.skipIf(HAS_SGX, 'mmap(PROT_WRITE, MAP_SHARED, fd) not implemented in Linux-SGX PAL') + def test_111_read_write_mmap(self): + file_path = os.path.join(self.OUTPUT_DIR, 'test_111') # new file to be created + stdout, stderr = self.run_binary(['read_write_mmap', file_path]) + size = '1048576' + self.assertNotIn('ERROR: ', stderr) + self.assertTrue(os.path.isfile(file_path)) + + self.assertIn('open(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('mmap_fd(' + size + ') OK', stdout) + self.assertIn('read(' + file_path + ') 1 RW (mmap) OK', stdout) + self.assertIn('seek(' + file_path + ') 1 RW (mmap) OK', stdout) + self.assertIn('write(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('seek(' + file_path + ') 2 RW (mmap) OK', stdout) + self.assertIn('read(' + file_path + ') 2 RW (mmap) OK', stdout) + self.assertIn('compare(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('munmap_fd(' + size + ') OK', stdout) + self.assertIn('close(' + file_path + ') RW (mmap) OK', stdout) + + self.assertIn('open(' + file_path + ') RW fd1 (mmap) OK', stdout) + self.assertIn('open(' + file_path + ') RW fd2 OK', stdout) + self.assertIn('mmap_fd(' + size + ') fd1 OK', stdout) + self.assertIn('write(' + file_path + ') RW fd2 OK', stdout) + self.assertIn('munmap_fd(' + size + ') fd1 OK', stdout) + self.assertIn('close(' + file_path + ') RW fd1 (mmap) OK', stdout) + self.assertIn('close(' + file_path + ') RW fd2 OK', stdout) + # pylint: disable=too-many-arguments def verify_seek_tell(self, stdout, output_path_1, output_path_2, size): self.assertIn('Test passed', stdout) diff --git a/libos/test/fs/test_tmpfs.py b/libos/test/fs/test_tmpfs.py index b359d669f5..a57d080c40 100644 --- a/libos/test/fs/test_tmpfs.py +++ b/libos/test/fs/test_tmpfs.py @@ -46,6 +46,32 @@ def test_110_read_write(self): self.assertIn('compare(' + file_path + ') RW OK', stdout) self.assertIn('close(' + file_path + ') RW OK', stdout) + # overrides TC_00_FileSystem to not skip Gramine-SGX and to skip verification of file existence + def test_111_read_write_mmap(self): + file_path = os.path.join(self.OUTPUT_DIR, 'test_111') # new file to be created + stdout, stderr = self.run_binary(['read_write_mmap', file_path]) + size = '1048576' + self.assertNotIn('ERROR: ', stderr) + + self.assertIn('open(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('mmap_fd(' + size + ') OK', stdout) + self.assertIn('read(' + file_path + ') 1 RW (mmap) OK', stdout) + self.assertIn('seek(' + file_path + ') 1 RW (mmap) OK', stdout) + self.assertIn('write(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('seek(' + file_path + ') 2 RW (mmap) OK', stdout) + self.assertIn('read(' + file_path + ') 2 RW (mmap) OK', stdout) + self.assertIn('compare(' + file_path + ') RW (mmap) OK', stdout) + self.assertIn('munmap_fd(' + size + ') OK', stdout) + self.assertIn('close(' + file_path + ') RW (mmap) OK', stdout) + + self.assertIn('open(' + file_path + ') RW fd1 (mmap) OK', stdout) + self.assertIn('open(' + file_path + ') RW fd2 OK', stdout) + self.assertIn('mmap_fd(' + size + ') fd1 OK', stdout) + self.assertIn('write(' + file_path + ') RW fd2 OK', stdout) + self.assertIn('munmap_fd(' + size + ') fd1 OK', stdout) + self.assertIn('close(' + file_path + ') RW fd1 (mmap) OK', stdout) + self.assertIn('close(' + file_path + ') RW fd2 OK', stdout) + @unittest.skip("impossible to do setup on tmpfs with python only") def test_115_seek_tell(self): test_fs.TC_00_FileSystem.test_115_seek_tell(self) diff --git a/libos/test/fs/tests.toml b/libos/test/fs/tests.toml index e7e462e314..362b737322 100644 --- a/libos/test/fs/tests.toml +++ b/libos/test/fs/tests.toml @@ -14,6 +14,7 @@ manifests = [ "open_close", "open_flags", "read_write", + "read_write_mmap", "seek_tell", "seek_tell_truncate", "stat",