
fix(vmm): only use memfd if no vhost-user-blk devices configured #4498

Merged 6 commits on Mar 15, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -41,6 +41,10 @@ and this project adheres to
information about page size to the payload Firecracker sends to the UFFD
handler. Each memory region object now contains a `page_size_kib` field. See
also the [hugepages documentation](docs/hugepages.md).
- [#4498](https://github.com/firecracker-microvm/firecracker/pull/4498): Only
  use memfd to back guest memory if a vhost-user-blk device is configured,
  otherwise use anonymous private memory. This is because serving page faults
  for the shared memory used by memfd is slower and may impact workloads.

### Deprecated

22 changes: 22 additions & 0 deletions docs/api_requests/block-vhost-user.md
@@ -74,6 +74,28 @@ be coming from the fact that the custom logic is implemented in the same process
that handles Virtio queues, which reduces the number of required context
switches.

## Disadvantages

In order for the backend to be able to process virtio requests, guest memory
needs to be shared by the frontend with the backend. This means that a shared
memory mapping is required to back guest memory. When a vhost-user device is
configured, Firecracker uses `memfd_create` instead of creating an anonymous
private mapping to achieve that. We observed that page faults to a shared
memory mapping take significantly longer (up to 24% in our testing), because
the Linux memory subsystem has to use atomic memory operations to update page
status, which is expensive under specific conditions. We advise users to
profile performance on their workloads when considering the use of vhost-user
devices.
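
For illustration, here is a minimal sketch (in Python, using the standard
library, with an arbitrary name and size) of the two backing strategies
described above: a memfd-backed shared mapping that can be handed to a backend
process, versus an anonymous private mapping.

```python
import mmap
import os

SIZE = 128 << 20  # 128 MiB, an arbitrary example size

# Shared mapping backed by a memfd, analogous to the vhost-user case: the file
# descriptor can be sent to the backend process so that both sides see the
# same guest memory.
memfd = os.memfd_create("guest_mem", os.MFD_CLOEXEC)
os.ftruncate(memfd, SIZE)
shared = mmap.mmap(
    memfd, SIZE, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ | mmap.PROT_WRITE
)

# Anonymous private mapping, analogous to the non-vhost-user case: there is no
# file descriptor to share, and faults on it avoid the extra bookkeeping
# required for shared pages.
private = mmap.mmap(
    -1,
    SIZE,
    flags=mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS,
    prot=mmap.PROT_READ | mmap.PROT_WRITE,
)

shared.close()
private.close()
os.close(memfd)
```

In Firecracker itself this choice is made in `src/vmm/src/builder.rs`,
selecting between `GuestMemoryMmap::memfd_backed` and
`GuestMemoryMmap::from_raw_regions` (see the diff below).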

## Other considerations

Compared to a virtio block device, where Firecracker interacts with a drive
file on the host, a vhost-user block device is handled by the backend directly.
Some workloads may benefit from the caching and readahead that the host page
cache offers for the backing file. This benefit is not available in the
vhost-user block case. Users may need to implement internal caching within the
backend if they find it appropriate.

## Backends

There are a number of open source implementations of a vhost-user backend
39 changes: 33 additions & 6 deletions src/vmm/src/builder.rs
@@ -237,12 +237,39 @@
        .ok_or(MissingKernelConfig)?;

    let track_dirty_pages = vm_resources.track_dirty_pages();
    let guest_memory = GuestMemoryMmap::memfd_backed(
        vm_resources.vm_config.mem_size_mib,
        track_dirty_pages,
        vm_resources.vm_config.huge_pages,
    )
    .map_err(StartMicrovmError::GuestMemory)?;

    let vhost_user_device_used = vm_resources
        .block
        .devices
        .iter()
        .any(|b| b.lock().expect("Poisoned lock").is_vhost_user());

    // Page faults are more expensive for a shared memory mapping, including memfd.
    // For this reason, we only back guest memory with a memfd if a vhost-user-blk
    // device is configured in the VM; otherwise we fall back to anonymous private memory.
    //
    // The vhost-user-blk branch is not currently covered by integration tests in Rust,
    // because that would require running a backend process. If in the future we converge
    // on a single way of backing guest memory for the vhost-user and non-vhost-user cases,
    // covering this branch separately would not be worth the effort.
    let guest_memory = if vhost_user_device_used {
        GuestMemoryMmap::memfd_backed(
            vm_resources.vm_config.mem_size_mib,
            track_dirty_pages,
            vm_resources.vm_config.huge_pages,
        )
        .map_err(StartMicrovmError::GuestMemory)?
    } else {
        let regions =
            crate::arch::arch_memory_regions(vm_resources.vm_config.mem_size_mib << 20);
        GuestMemoryMmap::from_raw_regions(
            &regions,
            track_dirty_pages,
            vm_resources.vm_config.huge_pages,
        )
        .map_err(StartMicrovmError::GuestMemory)?
    };

    let entry_addr = load_kernel(boot_config, &guest_memory)?;
    let initrd = load_initrd_from_config(boot_config, &guest_memory)?;
    // Clone the command-line so that a failed boot doesn't pollute the original.
2 changes: 1 addition & 1 deletion tests/integration_tests/functional/test_api.py
@@ -392,7 +392,7 @@ def test_api_machine_config(uvm_plain):
test_microvm.api.machine_config.patch(mem_size_mib=bad_size)

fail_msg = re.escape(
"Invalid Memory Configuration: Cannot resize memfd file: Custom { kind: InvalidInput, error: TryFromIntError(()) }"
"Invalid Memory Configuration: Cannot create mmap region: Out of memory (os error 12)"
)
with pytest.raises(RuntimeError, match=fail_msg):
test_microvm.start()
10 changes: 6 additions & 4 deletions tests/integration_tests/performance/test_huge_pages.py
@@ -18,11 +18,13 @@ def check_hugetlbfs_in_use(pid: int, allocation_name: str):

`allocation_name` should be the name of the smaps entry for which we want to verify that huge pages are used.
For memfd-backed guest memory, this would be "memfd:guest_mem" (the `guest_mem` part originating from the name
we give the memfd in memory.rs), for anonymous memory this would be "/anon_hugepage"
we give the memfd in memory.rs), for anonymous memory this would be "/anon_hugepage".
Note: in our testing, we do not currently configure vhost-user-blk devices, so we only exercise
the "/anon_hugepage" case.
"""

# Format of a sample smaps entry:
# 7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /memfd:guest_mem (deleted)
# 7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /anon_hugepage
# Size: 262144 kB
# KernelPageSize: 2048 kB
# MMUPageSize: 2048 kB
@@ -70,7 +72,7 @@ def test_hugetlbfs_boot(uvm_plain):

check_hugetlbfs_in_use(
uvm_plain.firecracker_pid,
"memfd:guest_mem",
"/anon_hugepage",
)


@@ -97,7 +99,7 @@ def test_hugetlbfs_snapshot(
rc, _, _ = vm.ssh.run("true")
assert not rc

check_hugetlbfs_in_use(vm.firecracker_pid, "memfd:guest_mem")
check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage")

snapshot = vm.snapshot_full()

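For reference, a rough sketch (in Python, with a hypothetical helper name; the
real helper in this file may differ) of the kind of smaps check described in
the docstring above:

```python
def uses_2m_huge_pages(pid: int, allocation_name: str) -> bool:
    """Hypothetical sketch: return True if the smaps entry whose name contains
    `allocation_name` (e.g. "/anon_hugepage" or "memfd:guest_mem") reports a
    2048 kB kernel page size, i.e. is backed by 2M huge pages."""
    in_entry = False
    with open(f"/proc/{pid}/smaps", encoding="utf-8") as smaps:
        for line in smaps:
            if allocation_name in line:
                # Header line of the mapping we care about, e.g.
                # "7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /anon_hugepage"
                in_entry = True
            elif in_entry and line.startswith("KernelPageSize:"):
                return line.split()[1] == "2048"
    return False
```
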
22 changes: 12 additions & 10 deletions tests/integration_tests/security/test_jail.py
@@ -489,21 +489,23 @@ def test_positive_file_size_limit(uvm_plain):

def test_negative_file_size_limit(uvm_plain):
"""
Test creating vm fails when memory size exceeds `fsize` limit.
This is caused by the fact that we back guest memory by memfd.
Test creating snapshot file fails when size exceeds `fsize` limit.
"""

vm_mem_size = 128
jail_limit = (vm_mem_size - 1) << 20

test_microvm = uvm_plain
test_microvm.jailer.resource_limits = [f"fsize={jail_limit}"]
# limit to 1MB, to account for logs and metrics
test_microvm.jailer.resource_limits = [f"fsize={2**20}"]
test_microvm.spawn()
test_microvm.basic_config(mem_size_mib=vm_mem_size)
test_microvm.basic_config()
test_microvm.start()

# Attempt to start a vm.
test_microvm.pause()

# Attempt to create a snapshot.
try:
test_microvm.start()
test_microvm.api.snapshot_create.put(
mem_file_path="/vm.mem",
snapshot_path="/vm.vmstate",
)
except (
http_client.RemoteDisconnected,
urllib3.exceptions.ProtocolError,