From 2997bf8a3771f20511fe8ab35cb90f8fc82e815f Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 10 Jan 2024 17:59:43 +0000 Subject: [PATCH 01/16] Add `huge_pages` field to `machine-config` endpoint This field allows specifying whether guest memory for this microVM should be backed by regular, 4K, pages or 2M hugetlbfs pages. Configuration fails if guest memory size is not a multiple of selected page size. Signed-off-by: Patrick Roy --- .../request/machine_configuration.rs | 56 +++++++++++++------ src/vmm/src/device_manager/persist.rs | 3 +- src/vmm/src/persist.rs | 1 + src/vmm/src/resources.rs | 15 ++++- src/vmm/src/vmm_config/machine_config.rs | 42 +++++++++++++- 5 files changed, 97 insertions(+), 20 deletions(-) diff --git a/src/firecracker/src/api_server/request/machine_configuration.rs b/src/firecracker/src/api_server/request/machine_configuration.rs index eeb8216a523..746b1e19009 100644 --- a/src/firecracker/src/api_server/request/machine_configuration.rs +++ b/src/firecracker/src/api_server/request/machine_configuration.rs @@ -74,6 +74,7 @@ pub(crate) fn parse_patch_machine_config(body: &Body) -> Result bool { + let divisor = match self { + // Any integer memory size expressed in MiB will be a multiple of 4096KiB. + HugePageConfig::None => 1, + HugePageConfig::Hugetlbfs2M => 2, + }; + + mem_size_mib % divisor == 0 + } +} + /// Struct used in PUT `/machine-config` API call. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] #[serde(deny_unknown_fields)] @@ -46,6 +71,9 @@ pub struct MachineConfig { /// Enables or disables dirty page tracking. Enabling allows incremental snapshots. #[serde(default)] pub track_dirty_pages: bool, + /// Configures what page size Firecracker should use to back guest memory. + #[serde(default)] + pub huge_pages: HugePageConfig, } impl Default for MachineConfig { @@ -78,6 +106,9 @@ pub struct MachineConfigUpdate { /// Enables or disables dirty page tracking. Enabling allows incremental snapshots. 
#[serde(skip_serializing_if = "Option::is_none")] pub track_dirty_pages: Option, + /// Configures what page size Firecracker should use to back guest memory. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub huge_pages: Option, } impl MachineConfigUpdate { @@ -97,6 +128,7 @@ impl From for MachineConfigUpdate { smt: Some(cfg.smt), cpu_template: cfg.cpu_template, track_dirty_pages: Some(cfg.track_dirty_pages), + huge_pages: Some(cfg.huge_pages), } } } @@ -114,6 +146,8 @@ pub struct VmConfig { pub cpu_template: Option, /// Enables or disables dirty page tracking. Enabling allows incremental snapshots. pub track_dirty_pages: bool, + /// Configures what page size Firecracker should use to back guest memory. + pub huge_pages: HugePageConfig, } impl VmConfig { @@ -148,8 +182,9 @@ impl VmConfig { } let mem_size_mib = update.mem_size_mib.unwrap_or(self.mem_size_mib); + let page_config = update.huge_pages.unwrap_or(self.huge_pages); - if mem_size_mib == 0 { + if mem_size_mib == 0 || !page_config.is_valid_mem_size(mem_size_mib) { return Err(VmConfigError::InvalidMemorySize); } @@ -165,6 +200,7 @@ impl VmConfig { smt, cpu_template, track_dirty_pages: update.track_dirty_pages.unwrap_or(self.track_dirty_pages), + huge_pages: page_config, }) } } @@ -177,6 +213,7 @@ impl Default for VmConfig { smt: false, cpu_template: None, track_dirty_pages: false, + huge_pages: HugePageConfig::None, } } } @@ -189,6 +226,7 @@ impl From<&VmConfig> for MachineConfig { smt: value.smt, cpu_template: value.cpu_template.as_ref().map(|template| template.into()), track_dirty_pages: value.track_dirty_pages, + huge_pages: value.huge_pages, } } } From 8bcab5ba5f4daeca2707332e43774b59152c1497 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 11 Jan 2024 11:23:20 +0000 Subject: [PATCH 02/16] chore: Mark huge pages support as developer preview Mark it as developer preview in case not everything gets worked out before 1.7. 
Signed-off-by: Patrick Roy --- src/vmm/src/resources.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 0709dfc100c..0c3a82da88d 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -10,7 +10,7 @@ use utils::net::ipv4addr::is_link_local_valid; use crate::cpu_config::templates::CustomCpuTemplate; use crate::device_manager::persist::SharedDeviceType; -use crate::logger::info; +use crate::logger::{info, log_dev_preview_warning}; use crate::mmds; use crate::mmds::data_store::{Mmds, MmdsVersion}; use crate::mmds::ns::MmdsNetworkStack; @@ -22,7 +22,7 @@ use crate::vmm_config::drive::*; use crate::vmm_config::entropy::*; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{ - MachineConfig, MachineConfigUpdate, VmConfig, VmConfigError, + HugePageConfig, MachineConfig, MachineConfigUpdate, VmConfig, VmConfigError, }; use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; @@ -238,6 +238,10 @@ impl VmResources { /// Updates the configuration of the microVM. pub fn update_vm_config(&mut self, update: &MachineConfigUpdate) -> Result<(), VmConfigError> { + if update.huge_pages.is_some() && update.huge_pages != Some(HugePageConfig::None) { + log_dev_preview_warning("Huge pages support", None); + } + let updated = self.vm_config.update(update)?; // The VM cannot have a memory size smaller than the target size From fb3be86e362fddb986dd60a5c11fcb078bb5bc52 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 11 Jan 2024 16:28:57 +0000 Subject: [PATCH 03/16] Wire up huge_page api parameter with memory allocation code Update the memory allocation code to be able to utilize hugetlbfs, and pass the corresponding arguments through from the api. Currently snapshots always restore on 4K pages, as huge page configuration is not yet saved in the vmstate file. 
Signed-off-by: Patrick Roy --- src/vmm/src/builder.rs | 9 ++- .../src/devices/virtio/block/virtio/io/mod.rs | 4 +- src/vmm/src/utilities/test_utils/mod.rs | 4 +- src/vmm/src/vmm_config/machine_config.rs | 23 ++++++ src/vmm/src/vstate/memory.rs | 75 ++++++++++++++----- 5 files changed, 91 insertions(+), 24 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index cac6580d319..42c3dba4fb9 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -237,9 +237,12 @@ pub fn build_microvm_for_boot( .ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = - GuestMemoryMmap::memfd_backed(vm_resources.vm_config.mem_size_mib, track_dirty_pages) - .map_err(StartMicrovmError::GuestMemory)?; + let guest_memory = GuestMemoryMmap::memfd_backed( + vm_resources.vm_config.mem_size_mib, + track_dirty_pages, + vm_resources.vm_config.huge_pages, + ) + .map_err(StartMicrovmError::GuestMemory)?; let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; // Clone the command-line so that a failed boot doesn't pollute the original. 
diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 58b7a456a9e..0068c1dddbd 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -206,6 +206,7 @@ pub mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::request::PendingRequest; + use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory::{Bitmap, Bytes, GuestMemory, GuestMemoryExtension}; const FILE_LEN: u32 = 1024; @@ -256,7 +257,8 @@ pub mod tests { } fn create_mem() -> GuestMemoryMmap { - GuestMemoryMmap::from_raw_regions(&[(GuestAddress(0), MEM_LEN)], true).unwrap() + GuestMemoryMmap::from_raw_regions(&[(GuestAddress(0), MEM_LEN)], true, HugePageConfig::None) + .unwrap() } fn check_dirty_mem(mem: &GuestMemoryMmap, addr: GuestAddress, len: u32) { diff --git a/src/vmm/src/utilities/test_utils/mod.rs b/src/vmm/src/utilities/test_utils/mod.rs index d15b9a85039..f46229b0566 100644 --- a/src/vmm/src/utilities/test_utils/mod.rs +++ b/src/vmm/src/utilities/test_utils/mod.rs @@ -13,6 +13,7 @@ use crate::seccomp_filters::get_empty_filters; use crate::utilities::mock_resources::{MockBootSourceConfig, MockVmConfig, MockVmResources}; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryMmap}; use crate::{EventManager, Vmm}; @@ -30,7 +31,8 @@ pub fn single_region_mem_at(at: u64, size: usize) -> GuestMemoryMmap { /// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking. 
pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { - GuestMemoryMmap::from_raw_regions(regions, false).expect("Cannot initialize memory") + GuestMemoryMmap::from_raw_regions(regions, false, HugePageConfig::None) + .expect("Cannot initialize memory") } /// Creates a [`GuestMemoryMmap`] of the given size with the contained regions laid out in diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index 9bb9caf5e90..f4a114db805 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -52,6 +52,29 @@ impl HugePageConfig { mem_size_mib % divisor == 0 } + + /// Returns the flags required to pass to `mmap`, in addition to `MAP_ANONYMOUS`, to + /// create a mapping backed by huge pages as described by this [`HugePageConfig`]. + pub fn mmap_flags(&self) -> libc::c_int { + match self { + HugePageConfig::None => 0, + HugePageConfig::Hugetlbfs2M => libc::MAP_HUGETLB | libc::MAP_HUGE_2MB, + } + } + + /// Returns `true` iff this [`HugePageConfig`] describes a hugetlbfs-based configuration. + pub fn is_hugetlbfs(&self) -> bool { + matches!(self, HugePageConfig::Hugetlbfs2M) + } +} + +impl From for Option { + fn from(value: HugePageConfig) -> Self { + match value { + HugePageConfig::None => None, + HugePageConfig::Hugetlbfs2M => Some(memfd::HugetlbSize::Huge2MB), + } + } } /// Struct used in PUT `/machine-config` API call. diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index b2dca00b1b7..b5abec3209a 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -19,6 +19,7 @@ pub use vm_memory::{ }; use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile}; +use crate::vmm_config::machine_config::HugePageConfig; use crate::DirtyBitmap; /// Type of GuestMemoryMmap. @@ -57,12 +58,17 @@ where Self: Sized, { /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd. 
- fn memfd_backed(mem_size_mib: usize, track_dirty_pages: bool) -> Result; + fn memfd_backed( + mem_size_mib: usize, + track_dirty_pages: bool, + huge_pages: HugePageConfig, + ) -> Result; /// Creates a GuestMemoryMmap from raw regions. fn from_raw_regions( regions: &[(GuestAddress, usize)], track_dirty_pages: bool, + huge_pages: HugePageConfig, ) -> Result; /// Creates a GuestMemoryMmap from raw regions. @@ -119,8 +125,12 @@ pub struct GuestMemoryState { impl GuestMemoryExtension for GuestMemoryMmap { /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd. - fn memfd_backed(mem_size_mib: usize, track_dirty_pages: bool) -> Result { - let memfd_file = create_memfd(mem_size_mib)?.into_file(); + fn memfd_backed( + mem_size_mib: usize, + track_dirty_pages: bool, + huge_pages: HugePageConfig, + ) -> Result { + let memfd_file = create_memfd(mem_size_mib, huge_pages.into())?.into_file(); let mut offset: u64 = 0; let regions = crate::arch::arch_memory_regions(mem_size_mib << 20) @@ -140,9 +150,16 @@ impl GuestMemoryExtension for GuestMemoryMmap { fn from_raw_regions( regions: &[(GuestAddress, usize)], track_dirty_pages: bool, + huge_pages: HugePageConfig, ) -> Result { let prot = libc::PROT_READ | libc::PROT_WRITE; - let flags = libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; + // MAP_NORESERVE for 4K-backed page regions means that no swap space will be reserved for + // the region. For hugetlbfs regions, it means that pages in the hugetlbfs pool will + // not be reserved at mmap-time. This means that instead of failing at mmap-time if + // the hugetlbfs page pool is too small to accommodate the entire VM, Firecracker might + // receive a SIGBUS if a pagefault ever cannot be served due to the pool being depleted. 
+ let flags = + libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | huge_pages.mmap_flags(); let regions = regions .iter() @@ -156,6 +173,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { .with_mmap_flags(flags) .build() .map_err(MemoryError::MmapRegionError)?; + GuestRegionMmap::new(region, *guest_address).map_err(MemoryError::VmMemoryError) }) .collect::, MemoryError>>()?; @@ -188,6 +206,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { .with_file_offset(file_offset) .build() .map_err(MemoryError::MmapRegionError)?; + GuestRegionMmap::new(region, guest_address).map_err(MemoryError::VmMemoryError) }) .collect::, MemoryError>>()?; @@ -224,7 +243,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { .iter() .map(|r| (GuestAddress(r.base_address), r.size)) .collect::>(); - Self::from_raw_regions(®ions, track_dirty_pages) + Self::from_raw_regions(®ions, track_dirty_pages, HugePageConfig::None) } } } @@ -324,11 +343,15 @@ impl GuestMemoryExtension for GuestMemoryMmap { } } -/// Creates a memfd file with the `size` in MiB. -fn create_memfd(size: usize) -> Result { +fn create_memfd( + size: usize, + hugetlb_size: Option, +) -> Result { let mem_size = size << 20; // Create a memfd. - let opts = memfd::MemfdOptions::default().allow_sealing(true); + let opts = memfd::MemfdOptions::default() + .hugetlb(hugetlb_size) + .allow_sealing(true); let mem_file = opts.create("guest_mem").map_err(MemoryError::Memfd)?; // Resize to guest mem size. 
@@ -376,7 +399,8 @@ mod tests { (GuestAddress(0x30000), region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, false).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, false, HugePageConfig::None).unwrap(); guest_memory.iter().for_each(|region| { assert!(region.bitmap().is_none()); }); @@ -392,7 +416,8 @@ mod tests { (GuestAddress(0x30000), region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, true, HugePageConfig::None).unwrap(); guest_memory.iter().for_each(|region| { assert!(region.bitmap().is_some()); }); @@ -470,7 +495,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, true, HugePageConfig::None).unwrap(); let dirty_map = [ // page 0: not dirty @@ -525,8 +551,12 @@ mod tests { let region_size = page_size * 3; // Test with a single region - let guest_memory = - GuestMemoryMmap::from_raw_regions(&[(GuestAddress(0), region_size)], false).unwrap(); + let guest_memory = GuestMemoryMmap::from_raw_regions( + &[(GuestAddress(0), region_size)], + false, + HugePageConfig::None, + ) + .unwrap(); check_serde(&guest_memory); // Test with some regions @@ -535,7 +565,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, true, HugePageConfig::None).unwrap(); check_serde(&guest_memory); } @@ -548,7 +579,9 @@ mod tests { (GuestAddress(0), page_size), (GuestAddress(page_size as u64 * 2), page_size), ]; - let guest_memory = 
GuestMemoryMmap::from_raw_regions(&mem_regions[..], true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions[..], true, HugePageConfig::None) + .unwrap(); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -573,7 +606,9 @@ mod tests { (GuestAddress(0), page_size * 3), (GuestAddress(page_size as u64 * 4), page_size * 3), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions[..], true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions[..], true, HugePageConfig::None) + .unwrap(); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -606,7 +641,8 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions, true, HugePageConfig::None).unwrap(); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -656,7 +692,8 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions, true, HugePageConfig::None).unwrap(); // Check that Firecracker bitmap is clean. 
guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -735,7 +772,7 @@ mod tests { let size = 1; let size_mb = 1 << 20; - let memfd = create_memfd(size).unwrap(); + let memfd = create_memfd(size, None).unwrap(); assert_eq!(memfd.as_file().metadata().unwrap().len(), size_mb); memfd.as_file().set_len(0x69).unwrap_err(); From ee716388cb77ddb3f04e1256492d2d96951bbee8 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 12 Jan 2024 09:58:38 +0000 Subject: [PATCH 04/16] Wire up huge pages support with snapshot feature We store the huge pages configuration in the snapshot's vmstate file, and enforce that a snapshot gets restored with the same hugepages configuration with which it was taken (for simplicity reasons). Signed-off-by: Patrick Roy --- src/vmm/src/persist.rs | 24 ++++++++++++----- src/vmm/src/rpc_interface.rs | 1 + src/vmm/src/vstate/memory.rs | 41 +++++++++++++++++++++++++++--- src/vmm/tests/integration_tests.rs | 2 ++ 4 files changed, 58 insertions(+), 10 deletions(-) diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 015414800ae..0a3c912f8ce 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -32,7 +32,7 @@ use crate::resources::VmResources; use crate::snapshot::Snapshot; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; -use crate::vmm_config::machine_config::{MachineConfigUpdate, VmConfigError}; +use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigUpdate, VmConfigError}; use crate::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType, }; @@ -54,6 +54,8 @@ pub struct VmInfo { pub cpu_template: StaticCpuTemplate, /// Boot source information. 
pub boot_source: BootSourceConfig, + /// Huge page configuration + pub huge_pages: HugePageConfig, } impl From<&VmResources> for VmInfo { @@ -63,6 +65,7 @@ impl From<&VmResources> for VmInfo { smt: value.vm_config.smt, cpu_template: StaticCpuTemplate::from(&value.vm_config.cpu_template), boot_source: value.boot_source_config().clone(), + huge_pages: value.vm_config.huge_pages, } } } @@ -399,7 +402,7 @@ pub fn restore_from_snapshot( smt: Some(microvm_state.vm_info.smt), cpu_template: Some(microvm_state.vm_info.cpu_template), track_dirty_pages: Some(track_dirty_pages), - huge_pages: None, // TODO: snapshot integration + huge_pages: Some(microvm_state.vm_info.huge_pages), }) .map_err(BuildMicrovmFromSnapshotError::VmUpdateConfig)?; @@ -411,8 +414,13 @@ pub fn restore_from_snapshot( let (guest_memory, uffd) = match params.mem_backend.backend_type { MemBackendType::File => ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) - .map_err(RestoreFromSnapshotGuestMemoryError::File)?, + guest_memory_from_file( + mem_backend_path, + mem_state, + track_dirty_pages, + vm_resources.vm_config.huge_pages, + ) + .map_err(RestoreFromSnapshotGuestMemoryError::File)?, None, ), MemBackendType::Uffd => guest_memory_from_uffd( @@ -422,6 +430,7 @@ pub fn restore_from_snapshot( // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device // is present in the microVM state. 
microvm_state.device_states.balloon_device.is_some(), + vm_resources.vm_config.huge_pages, ) .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?, }; @@ -475,9 +484,11 @@ fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, + huge_pages: HugePageConfig, ) -> Result { let mem_file = File::open(mem_file_path)?; - let guest_mem = GuestMemoryMmap::from_state(Some(&mem_file), mem_state, track_dirty_pages)?; + let guest_mem = + GuestMemoryMmap::from_state(Some(&mem_file), mem_state, track_dirty_pages, huge_pages)?; Ok(guest_mem) } @@ -501,8 +512,9 @@ fn guest_memory_from_uffd( mem_state: &GuestMemoryState, track_dirty_pages: bool, enable_balloon: bool, + huge_pages: HugePageConfig, ) -> Result<(GuestMemoryMmap, Option), GuestMemoryFromUffdError> { - let guest_memory = GuestMemoryMmap::from_state(None, mem_state, track_dirty_pages)?; + let guest_memory = GuestMemoryMmap::from_state(None, mem_state, track_dirty_pages, huge_pages)?; let mut uffd_builder = UffdBuilder::new(); diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 62a0fc10991..b8a3929f42b 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -1069,6 +1069,7 @@ mod tests { smt: value.vm_config.smt, cpu_template: StaticCpuTemplate::from(&value.vm_config.cpu_template), boot_source: value.boot_source_config().clone(), + huge_pages: value.vm_config.huge_pages, } } } diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index b5abec3209a..1dd89f9b104 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -50,6 +50,8 @@ pub enum MemoryError { Memfd(memfd::Error), /// Cannot resize memfd file: {0:?} MemfdSetLen(std::io::Error), + /// Cannot restore hugetlbfs backed snapshot by mapping the memory file. Please use uffd. + HugetlbfsSnapshot, } /// Defines the interface for snapshotting memory. 
@@ -84,6 +86,7 @@ where file: Option<&File>, state: &GuestMemoryState, track_dirty_pages: bool, + huge_pages: HugePageConfig, ) -> Result; /// Describes GuestMemoryMmap through a GuestMemoryState struct. @@ -220,9 +223,14 @@ impl GuestMemoryExtension for GuestMemoryMmap { file: Option<&File>, state: &GuestMemoryState, track_dirty_pages: bool, + huge_pages: HugePageConfig, ) -> Result { match file { Some(f) => { + if huge_pages.is_hugetlbfs() { + return Err(MemoryError::HugetlbfsSnapshot); + } + let regions = state .regions .iter() @@ -243,7 +251,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { .iter() .map(|r| (GuestAddress(r.base_address), r.size)) .collect::>(); - Self::from_raw_regions(®ions, track_dirty_pages, HugePageConfig::None) + Self::from_raw_regions(®ions, track_dirty_pages, huge_pages) } } } @@ -485,6 +493,25 @@ mod tests { } } + #[test] + fn test_from_state() { + let state = GuestMemoryState { + regions: vec![GuestMemoryRegionState { + base_address: 0, + size: 4096, + offset: 0, + }], + }; + let file = TempFile::new().unwrap().into_file(); + + // No mapping of snapshots that were taken with hugetlbfs enabled + let err = + GuestMemoryMmap::from_state(Some(&file), &state, false, HugePageConfig::Hugetlbfs2M) + .unwrap_err(); + + assert!(matches!(err, MemoryError::HugetlbfsSnapshot), "{:?}", err); + } + #[test] fn test_mark_dirty() { let page_size = get_page_size().unwrap(); @@ -664,8 +691,13 @@ mod tests { let mut memory_file = TempFile::new().unwrap().into_file(); guest_memory.dump(&mut memory_file).unwrap(); - let restored_guest_memory = - GuestMemoryMmap::from_state(Some(&memory_file), &memory_state, false).unwrap(); + let restored_guest_memory = GuestMemoryMmap::from_state( + Some(&memory_file), + &memory_state, + false, + HugePageConfig::None, + ) + .unwrap(); // Check that the region contents are the same. 
let mut restored_region = vec![0u8; page_size * 2]; @@ -723,7 +755,8 @@ mod tests { // We can restore from this because this is the first dirty dump. let restored_guest_memory = - GuestMemoryMmap::from_state(Some(&file), &memory_state, false).unwrap(); + GuestMemoryMmap::from_state(Some(&file), &memory_state, false, HugePageConfig::None) + .unwrap(); // Check that the region contents are the same. let mut restored_region = vec![0u8; region_size]; diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6e7436893a0..8fb7395ab00 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -16,6 +16,7 @@ use vmm::utilities::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE}; use vmm::utilities::test_utils::dirty_tracking_vmm; use vmm::utilities::test_utils::{create_vmm, default_vmm, default_vmm_no_boot}; use vmm::vmm_config::instance_info::{InstanceInfo, VmState}; +use vmm::vmm_config::machine_config::HugePageConfig; use vmm::vmm_config::snapshot::{CreateSnapshotParams, SnapshotType}; use vmm::{DumpCpuConfigError, EventManager, FcExitCode}; @@ -236,6 +237,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { Some(memory_file.as_file()), µvm_state.memory_state, false, + HugePageConfig::None, ) .unwrap(); From da62f7d8a3950a5ad0c5712b006435e7f73aae73 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 12 Jan 2024 14:41:42 +0000 Subject: [PATCH 05/16] Gracefully fail if hugetlbfs is attempted to be used on <4.16 host Support for memfd_create with `MFD_HUGETLB | MFD_ALLOW_SEALING` was only added in 4.16, so trying to use hugetlbfs backed guest memory on 4.14 host will fail [1]. Make the error message shown to the customer a bit nicer. 
[1]: https://man7.org/linux/man-pages/man2/memfd_create.2.html Signed-off-by: Patrick Roy --- src/vmm/src/resources.rs | 9 ++++-- src/vmm/src/vmm_config/machine_config.rs | 41 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 0c3a82da88d..5963ed63502 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -472,6 +472,7 @@ mod tests { use std::str::FromStr; use serde_json::{Map, Value}; + use utils::kernel_version::KernelVersion; use utils::net::mac::MacAddr; use utils::tempfile::TempFile; @@ -1378,9 +1379,11 @@ mod tests { VmConfigError::InvalidMemorySize ); - // mem_size_mib compatible with huge page configuration - aux_vm_config.mem_size_mib = Some(2048); - vm_resources.update_vm_config(&aux_vm_config).unwrap(); + if KernelVersion::get().unwrap() >= KernelVersion::new(5, 10, 0) { + // mem_size_mib compatible with huge page configuration + aux_vm_config.mem_size_mib = Some(2048); + vm_resources.update_vm_config(&aux_vm_config).unwrap(); + } } #[test] diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index f4a114db805..a84b8ffc753 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -3,6 +3,8 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use utils::kernel_version; +use utils::kernel_version::KernelVersion; use crate::cpu_config::templates::{CpuTemplateType, CustomCpuTemplate, StaticCpuTemplate}; @@ -27,6 +29,18 @@ pub enum VmConfigError { /// Enabling simultaneous multithreading is not supported on aarch64. #[cfg(target_arch = "aarch64")] SmtNotSupported, + /// Could not determine host kernel version when checking hugetlbfs compatibility + KernelVersion, + /// Firecracker's hugetlbfs support requires at least host kernel 5.10. 
+ HugetlbfsNotSupported, +} + +// We cannot do a `KernelVersion(kernel_version::Error)` variant because `kernel_version::Error` +// does not implement `PartialEq, Eq` (due to containing an io error). +impl From for VmConfigError { + fn from(_: kernel_version::Error) -> Self { + VmConfigError::KernelVersion + } } /// Describes the possible (huge)page configurations for a microVM's memory. @@ -217,6 +231,10 @@ impl VmConfig { Some(other) => Some(CpuTemplateType::Static(other)), }; + if page_config.is_hugetlbfs() && KernelVersion::get()? < KernelVersion::new(4, 16, 0) { + return Err(VmConfigError::HugetlbfsNotSupported); + } + Ok(VmConfig { vcpu_count, mem_size_mib, @@ -253,3 +271,26 @@ impl From<&VmConfig> for MachineConfig { } } } + +#[cfg(test)] +mod tests { + use utils::kernel_version::KernelVersion; + + use crate::vmm_config::machine_config::{ + HugePageConfig, MachineConfigUpdate, VmConfig, VmConfigError, + }; + + #[test] + fn test_hugetlbfs_not_supported_4_14() { + if KernelVersion::get().unwrap() < KernelVersion::new(4, 16, 0) { + let base_config = VmConfig::default(); + let update = MachineConfigUpdate { + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }; + + let err = base_config.update(&update).unwrap_err(); + assert_eq!(err, VmConfigError::HugetlbfsNotSupported) + } + } +} From 24dc497209222cc77e91d8ea826fc8ffae20ce0b Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 12 Jan 2024 11:46:33 +0000 Subject: [PATCH 06/16] fix(tests): Add huge_pages parameter to relevant tests Tests that explicitly check API responses now need to deal with the new huge_pages parameter. 
Signed-off-by: Patrick Roy --- tests/framework/vm_config.json | 3 ++- tests/integration_tests/functional/test_api.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 32b4f615655..5df673308d9 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -21,7 +21,8 @@ "vcpu_count": 2, "mem_size_mib": 1024, "smt": false, - "track_dirty_pages": false + "track_dirty_pages": false, + "huge_pages": "None" }, "cpu-config": null, "balloon": null, diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 95086b5cd43..4c3c8c6998d 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -399,7 +399,10 @@ def test_api_machine_config(uvm_plain): # Test invalid mem_size_mib = 0. with pytest.raises( - RuntimeError, match=re.escape("The memory size (MiB) is invalid.") + RuntimeError, + match=re.escape( + "The memory size (MiB) is either 0, or not a multiple of the configured page size." + ), ): test_microvm.api.machine_config.patch(mem_size_mib=0) @@ -1105,6 +1108,7 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): "mem_size_mib": 256, "smt": True, "track_dirty_pages": False, + "huge_pages": "None", } if cpu_vendor == utils_cpuid.CpuVendor.ARM: @@ -1221,6 +1225,7 @@ def test_get_full_config(uvm_plain): "mem_size_mib": 256, "smt": False, "track_dirty_pages": False, + "huge_pages": "None", } expected_cfg["cpu-config"] = None expected_cfg["boot-source"] = { From 807e1c788ef1abfb19e2c794363a4d22d4ef0bec Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 12 Jan 2024 13:26:38 +0000 Subject: [PATCH 07/16] test: Add test that booting with hugetlbfs memory works Attempts to boot a microvm with guest memory backed by 2MB hugetlbfs pages. 
Adjusts the test infrastructure to allocate 2MB pages prior to test run (failing if it cannot do so). For now, we rely on no other process on the host trying to use hugetlbfs. The test is skipped on 4.14 because hugetlbfs support for sealable memfds was only added in 4.16. We put the test as a performance test to ensure it runs on ag=1 agents, to avoid problems with different agents on the same metal concurrently modifying the hugetlbfs pool. Signed-off-by: Patrick Roy --- src/vmm/src/vmm_config/machine_config.rs | 1 + tests/framework/microvm.py | 9 +++ .../performance/test_huge_pages.py | 66 +++++++++++++++++++ tools/devtool | 30 +++++++-- 4 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 tests/integration_tests/performance/test_huge_pages.py diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index a84b8ffc753..0955ece0149 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -286,6 +286,7 @@ mod tests { let base_config = VmConfig::default(); let update = MachineConfigUpdate { huge_pages: Some(HugePageConfig::Hugetlbfs2M), + mem_size_mib: Some(1024), ..Default::default() }; diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index af2f9a2a5f3..3b5ae14fc70 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -138,6 +138,13 @@ def delete(self): self.vmstate.unlink() +class HugePagesConfig(str, Enum): + """Enum describing the huge pages configurations supported Firecracker""" + + NONE = "None" + HUGETLBFS_2MB = "2M" + + # pylint: disable=R0904 class Microvm: """Class to represent a Firecracker microvm. 
@@ -631,6 +638,7 @@ def basic_config( boot_args: str = None, use_initrd: bool = False, track_dirty_pages: bool = False, + huge_pages: HugePagesConfig = None, rootfs_io_engine=None, cpu_template: Optional[str] = None, enable_entropy_device=False, @@ -658,6 +666,7 @@ def basic_config( mem_size_mib=mem_size_mib, track_dirty_pages=track_dirty_pages, cpu_template=cpu_template, + huge_pages=huge_pages, ) self.vcpus_count = vcpu_count self.mem_size_bytes = mem_size_mib * 2**20 diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py new file mode 100644 index 00000000000..040ece2f85f --- /dev/null +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -0,0 +1,66 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Integration tests for Firecracker's huge pages support""" +import pytest + +from framework import utils +from framework.microvm import HugePagesConfig +from framework.properties import global_props + + +def check_hugetlbfs_in_use(pid: int, allocation_name: str): + """Asserts that the process with the given `pid` is using hugetlbfs pages somewhere. + + `allocation_name` should be the name of the smaps entry for which we want to verify that huge pages are used. 
+ For memfd-backed guest memory, this would be "memfd:guest_mem" (the `guest_mem` part originating from the name + we give the memfd in memory.rs), for anonymous memory this would be "/anon_hugepage" + """ + + # Format of a sample smaps entry: + # 7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /memfd:guest_mem (deleted) + # Size: 262144 kB + # KernelPageSize: 2048 kB + # MMUPageSize: 2048 kB + # Rss: 0 kB + # Pss: 0 kB + # Pss_Dirty: 0 kB + # Shared_Clean: 0 kB + # Shared_Dirty: 0 kB + # Private_Clean: 0 kB + # Private_Dirty: 0 kB + # Referenced: 0 kB + # Anonymous: 0 kB + # LazyFree: 0 kB + # AnonHugePages: 0 kB + # ShmemPmdMapped: 0 kB + # FilePmdMapped: 0 kB + # Shared_Hugetlb: 0 kB + # Private_Hugetlb: 92160 kB + # Swap: 0 kB + # SwapPss: 0 kB + # Locked: 0 kB + # THPeligible: 0 + # ProtectionKey: 0 + cmd = f"cat /proc/{pid}/smaps | grep {allocation_name} -A 23 | grep KernelPageSize" + _, stdout, _ = utils.run_cmd(cmd) + + kernel_page_size_kib = int(stdout.split()[1]) + assert kernel_page_size_kib > 4 + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_hugetlbfs_boot(uvm_plain): + """Tests booting a microvm with guest memory backed by 2MB hugetlbfs pages""" + + uvm_plain.spawn() + uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) + uvm_plain.add_net_iface() + uvm_plain.start() + + rc, _, _ = uvm_plain.ssh.run("true") + assert not rc + + check_hugetlbfs_in_use(uvm_plain.firecracker_pid, "memfd:guest_mem") diff --git a/tools/devtool b/tools/devtool index 2e5ee206499..ba84fd31bc6 100755 --- a/tools/devtool +++ b/tools/devtool @@ -687,10 +687,26 @@ cmd_test() { say "RPM microcode_ctl version: $(rpm -q microcode_ctl)" env |grep -P "^(AWS_EMF_|BUILDKITE|CODECOV_)" > env.list - if [[ $performance_tweaks -eq 1 ]] && [[ "$(uname --machine)" == "x86_64" ]]; then - say "Detected CI and performance tests, tuning CPU 
frequency scaling and idle states for reduced variability" + if [[ $performance_tweaks -eq 1 ]]; then + if [[ "$(uname --machine)" == "x86_64" ]]; then + say "Detected CI and performance tests, tuning CPU frequency scaling and idle states for reduced variability" + + apply_performance_tweaks + fi + + # It seems that even if the tests using huge pages run sequentially on ag=1 agents, right-sizing the huge pages + # pool to the total number of huge pages used across all tests results in spurious failures with pool depletion + # anyway (something else on the host seems to be stealing our huge pages, and we cannot "ear mark" them for + # Firecracker processes). Thus, just allocate 4GB of them and call it a day. + say "Setting up huge pages pool" + num_hugetlbfs_pages=2048 + + huge_pages_old=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + huge_pages_new=$(echo $num_hugetlbfs_pages |sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + fi - apply_performance_tweaks + if [[ "$huge_pages_new" -ne "$num_hugetlbfs_pages" ]]; then + die "Failed to allocate $num_hugetlbfs_pages hugetlbfs pages, only got $huge_pages_new" fi say "Starting test run ..." @@ -727,8 +743,12 @@ cmd_test() { cmd_fix_perms # undo performance tweaks (in case the instance gets recycled for a non-perf test) - if [[ $performance_tweaks -eq 1 ]] && [[ "$(uname --machine)" == "x86_64" ]]; then - unapply_performance_tweaks + if [[ $performance_tweaks -eq 1 ]]; then + if [[ "$(uname --machine)" == "x86_64" ]]; then + unapply_performance_tweaks + fi + + echo $huge_pages_old |sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages >/dev/null fi # do not leave behind env.list file From 3d4c6742e4cd25a6f8fb02ff0cfd8413ccdf034a Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 12 Jan 2024 15:37:34 +0000 Subject: [PATCH 08/16] Generalize uffd handler to allow faulting in huge pages Make the `valid_handler.rs` code sample page-size agnostic, in preparating for hugepages tests. 
Signed-off-by: Patrick Roy --- ...handling-page-faults-on-snapshot-resume.md | 2 +- src/firecracker/Cargo.toml | 8 ++-- ...ous_handler.rs => malicious_4k_handler.rs} | 2 +- src/firecracker/examples/uffd/uffd_utils.rs | 41 ++++++++++--------- .../{valid_handler.rs => valid_4k_handler.rs} | 4 +- tests/conftest.py | 2 +- .../integration_tests/functional/test_uffd.py | 4 +- 7 files changed, 33 insertions(+), 30 deletions(-) rename src/firecracker/examples/uffd/{malicious_handler.rs => malicious_4k_handler.rs} (95%) rename src/firecracker/examples/uffd/{valid_handler.rs => valid_4k_handler.rs} (95%) diff --git a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md index 9f7d9314091..d699c5d24ee 100644 --- a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md +++ b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md @@ -161,7 +161,7 @@ connect/send data. ### Example An example of a handler process can be found -[here](../../src/firecracker/examples/uffd/valid_handler.rs). The process is +[here](../../src/firecracker/examples/uffd/valid_4k_handler.rs). The process is designed to tackle faults on a certain address by loading into memory the entire region that the address belongs to, but users can choose any other behavior that suits their use case best. 
diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 8da0d04de16..78b8e44e67f 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -50,12 +50,12 @@ serde_json = "1.0.113" tracing = ["log-instrument", "seccompiler/tracing", "utils/tracing", "vmm/tracing"] [[example]] -name = "uffd_malicious_handler" -path = "examples/uffd/malicious_handler.rs" +name = "uffd_malicious_4k_handler" +path = "examples/uffd/malicious_4k_handler.rs" [[example]] -name = "uffd_valid_handler" -path = "examples/uffd/valid_handler.rs" +name = "uffd_valid_4k_handler" +path = "examples/uffd/valid_4k_handler.rs" [[example]] name = "seccomp_harmless" diff --git a/src/firecracker/examples/uffd/malicious_handler.rs b/src/firecracker/examples/uffd/malicious_4k_handler.rs similarity index 95% rename from src/firecracker/examples/uffd/malicious_handler.rs rename to src/firecracker/examples/uffd/malicious_4k_handler.rs index 9af94e057aa..157d3d7e147 100644 --- a/src/firecracker/examples/uffd/malicious_handler.rs +++ b/src/firecracker/examples/uffd/malicious_4k_handler.rs @@ -23,7 +23,7 @@ fn main() { let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); let mut runtime = Runtime::new(stream, file); - runtime.run(|uffd_handler: &mut UffdHandler| { + runtime.run(4096, |uffd_handler: &mut UffdHandler| { // Read an event from the userfaultfd. let event = uffd_handler .read_event() diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index 822ce178fac..d0e3b3cb91e 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -12,7 +12,6 @@ use std::ptr; use serde::{Deserialize, Serialize}; use userfaultfd::{Error, Event, Uffd}; -use utils::get_page_size; use utils::sock_ctrl_msg::ScmSocket; // This is the same with the one used in src/vmm. 
@@ -33,7 +32,7 @@ pub struct GuestRegionUffdMapping { pub offset: u64, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] pub enum MemPageState { Uninitialized, FromFile, @@ -50,12 +49,18 @@ struct MemRegion { #[derive(Debug)] pub struct UffdHandler { mem_regions: Vec, + page_size: usize, backing_buffer: *const u8, uffd: Uffd, } impl UffdHandler { - pub fn from_unix_stream(stream: &UnixStream, backing_buffer: *const u8, size: usize) -> Self { + pub fn from_unix_stream( + stream: &UnixStream, + page_size: usize, + backing_buffer: *const u8, + size: usize, + ) -> Self { let mut message_buf = vec![0u8; 1024]; let (bytes_read, file) = stream .recv_with_fd(&mut message_buf[..]) @@ -71,13 +76,15 @@ impl UffdHandler { // Make sure memory size matches backing data size. assert_eq!(memsize, size); + assert!(page_size.is_power_of_two()); let uffd = unsafe { Uffd::from_raw_fd(file.into_raw_fd()) }; - let mem_regions = create_mem_regions(&mappings); + let mem_regions = create_mem_regions(&mappings, page_size); Self { mem_regions, + page_size, backing_buffer, uffd, } @@ -87,21 +94,19 @@ impl UffdHandler { self.uffd.read_event() } - pub fn update_mem_state_mappings(&mut self, start: u64, end: u64, state: &MemPageState) { + pub fn update_mem_state_mappings(&mut self, start: u64, end: u64, state: MemPageState) { for region in self.mem_regions.iter_mut() { for (key, value) in region.page_states.iter_mut() { if key >= &start && key < &end { - *value = state.clone(); + *value = state; } } } } pub fn serve_pf(&mut self, addr: *mut u8, len: usize) { - let page_size = get_page_size().unwrap(); - // Find the start of the page that the current faulting address belongs to. - let dst = (addr as usize & !(page_size as usize - 1)) as *mut libc::c_void; + let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void; let fault_page_addr = dst as u64; // Get the state of the current faulting page. 
@@ -117,12 +122,12 @@ impl UffdHandler { // memory from the host (through balloon device) Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => { let (start, end) = self.populate_from_file(region, fault_page_addr, len); - self.update_mem_state_mappings(start, end, &MemPageState::FromFile); + self.update_mem_state_mappings(start, end, MemPageState::FromFile); return; } Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => { let (start, end) = self.zero_out(fault_page_addr); - self.update_mem_state_mappings(start, end, &MemPageState::Anonymous); + self.update_mem_state_mappings(start, end, MemPageState::Anonymous); return; } None => {} @@ -152,17 +157,15 @@ impl UffdHandler { } fn zero_out(&mut self, addr: u64) -> (u64, u64) { - let page_size = get_page_size().unwrap(); - let ret = unsafe { self.uffd - .zeropage(addr as *mut _, page_size, true) + .zeropage(addr as *mut _, self.page_size, true) .expect("Uffd zeropage failed") }; // Make sure the UFFD zeroed out some bytes. assert!(ret > 0); - (addr, addr + page_size as u64) + (addr, addr + self.page_size as u64) } } @@ -211,7 +214,7 @@ impl Runtime { /// When uffd is polled, page fault is handled by /// calling `pf_event_dispatch` with corresponding /// uffd object passed in. 
- pub fn run(&mut self, pf_event_dispatch: impl Fn(&mut UffdHandler)) { + pub fn run(&mut self, page_size: usize, pf_event_dispatch: impl Fn(&mut UffdHandler)) { let mut pollfds = vec![]; // Poll the stream for incoming uffds @@ -246,6 +249,7 @@ impl Runtime { // Handle new uffd from stream let handler = UffdHandler::from_unix_stream( &self.stream, + page_size, self.backing_memory, self.backing_memory_size, ); @@ -270,8 +274,7 @@ impl Runtime { } } -fn create_mem_regions(mappings: &Vec) -> Vec { - let page_size = get_page_size().unwrap(); +fn create_mem_regions(mappings: &Vec, page_size: usize) -> Vec { let mut mem_regions: Vec = Vec::with_capacity(mappings.len()); for r in mappings.iter() { @@ -327,7 +330,7 @@ mod tests { let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); // Update runtime with actual runtime let runtime = uninit_runtime.write(Runtime::new(stream, file)); - runtime.run(|_: &mut UffdHandler| {}); + runtime.run(4096, |_: &mut UffdHandler| {}); }); // wait for runtime thread to initialize itself diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/valid_4k_handler.rs similarity index 95% rename from src/firecracker/examples/uffd/valid_handler.rs rename to src/firecracker/examples/uffd/valid_4k_handler.rs index 609380afa8a..1f752f141f1 100644 --- a/src/firecracker/examples/uffd/valid_handler.rs +++ b/src/firecracker/examples/uffd/valid_4k_handler.rs @@ -30,7 +30,7 @@ fn main() { let len = get_page_size().unwrap(); let mut runtime = Runtime::new(stream, file); - runtime.run(|uffd_handler: &mut UffdHandler| { + runtime.run(len, |uffd_handler: &mut UffdHandler| { // Read an event from the userfaultfd. 
let event = uffd_handler .read_event() @@ -44,7 +44,7 @@ fn main() { userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( start as u64, end as u64, - &MemPageState::Removed, + MemPageState::Removed, ), _ => panic!("Unexpected event on userfaultfd"), } diff --git a/tests/conftest.py b/tests/conftest.py index 1732bde4ab6..261ed8e1a94 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -229,7 +229,7 @@ def uffd_handler_paths(): """Build UFFD handler binaries.""" handlers = { f"{handler}_handler": build_tools.get_example(f"uffd_{handler}_handler") - for handler in ["malicious", "valid"] + for handler in ["malicious_4k", "valid_4k"] } yield handlers diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index 6e7e96552a8..8d70cedff46 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -110,7 +110,7 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths): # Spawn page fault handler process. _pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["valid_handler"], snapshot.mem + vm, uffd_handler_paths["valid_4k_handler"], snapshot.mem ) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) @@ -144,7 +144,7 @@ def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths): # Spawn page fault handler process. 
_pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["malicious_handler"], snapshot.mem + vm, uffd_handler_paths["malicious_4k_handler"], snapshot.mem ) # We expect Firecracker to freeze while resuming from a snapshot From 0c99b9f172a33d5d078452ad3d0e035710011533 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 12 Jan 2024 15:41:19 +0000 Subject: [PATCH 09/16] test: Add snapshot restore test for hugetlbfs backed guest The test has to be UFFD based, as we cannot mmap the file with hugetlbfs enabled (as `MAP_HUGETLB` is a modifier to `MAP_ANONYMOUS`, which precludes file mappings). Signed-off-by: Patrick Roy --- src/firecracker/Cargo.toml | 4 ++ .../examples/uffd/valid_2m_handler.rs | 51 +++++++++++++++++++ tests/conftest.py | 2 +- .../performance/test_huge_pages.py | 48 +++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 src/firecracker/examples/uffd/valid_2m_handler.rs diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 78b8e44e67f..73133a966d5 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -57,6 +57,10 @@ path = "examples/uffd/malicious_4k_handler.rs" name = "uffd_valid_4k_handler" path = "examples/uffd/valid_4k_handler.rs" +[[example]] +name = "uffd_valid_2m_handler" +path = "examples/uffd/valid_2m_handler.rs" + [[example]] name = "seccomp_harmless" path = "examples/seccomp/harmless.rs" diff --git a/src/firecracker/examples/uffd/valid_2m_handler.rs b/src/firecracker/examples/uffd/valid_2m_handler.rs new file mode 100644 index 00000000000..d824ca01f55 --- /dev/null +++ b/src/firecracker/examples/uffd/valid_2m_handler.rs @@ -0,0 +1,51 @@ +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Provides functionality for a userspace page fault handler +//! which loads the whole region from the backing memory file +//! when a page fault occurs. 
+ +mod uffd_utils; + +use std::fs::File; +use std::os::unix::net::UnixListener; + +use uffd_utils::{MemPageState, Runtime, UffdHandler}; + +fn main() { + let mut args = std::env::args(); + let uffd_sock_path = args.nth(1).expect("No socket path given"); + let mem_file_path = args.next().expect("No memory file given"); + + let file = File::open(mem_file_path).expect("Cannot open memfile"); + + // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. + let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); + let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + + // Populate a single page from backing memory file. + // This is just an example, probably, with the worst-case latency scenario, + // of how memory can be loaded in guest RAM. + let len = 2 * 1024 * 1024; + + let mut runtime = Runtime::new(stream, file); + runtime.run(len, |uffd_handler: &mut UffdHandler| { + // Read an event from the userfaultfd. + let event = uffd_handler + .read_event() + .expect("Failed to read uffd_msg") + .expect("uffd_msg not ready"); + + // We expect to receive either a Page Fault or Removed + // event (if the balloon device is enabled). + match event { + userfaultfd::Event::Pagefault { addr, .. 
} => uffd_handler.serve_pf(addr.cast(), len), + userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( + start as u64, + end as u64, + MemPageState::Removed, + ), + _ => panic!("Unexpected event on userfaultfd"), + } + }); +} diff --git a/tests/conftest.py b/tests/conftest.py index 261ed8e1a94..5b0a8962f7e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -229,7 +229,7 @@ def uffd_handler_paths(): """Build UFFD handler binaries.""" handlers = { f"{handler}_handler": build_tools.get_example(f"uffd_{handler}_handler") - for handler in ["malicious_4k", "valid_4k"] + for handler in ["malicious_4k", "valid_4k", "valid_2m"] } yield handlers diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 040ece2f85f..a86f2c9f2e6 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -6,6 +6,7 @@ from framework import utils from framework.microvm import HugePagesConfig from framework.properties import global_props +from integration_tests.functional.test_uffd import SOCKET_PATH, spawn_pf_handler def check_hugetlbfs_in_use(pid: int, allocation_name: str): @@ -64,3 +65,50 @@ def test_hugetlbfs_boot(uvm_plain): assert not rc check_hugetlbfs_in_use(uvm_plain.firecracker_pid, "memfd:guest_mem") + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_hugetlbfs_snapshot( + microvm_factory, guest_kernel_linux_5_10, rootfs_ubuntu_22, uffd_handler_paths +): + """ + Test hugetlbfs snapshot restore via uffd + """ + + ### Create Snapshot ### + vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs_ubuntu_22) + vm.memory_monitor = None + vm.spawn() + vm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) + vm.add_net_iface() + vm.start() + + # Wait for microvm to boot + rc, _, _ = 
vm.ssh.run("true") + assert not rc + + check_hugetlbfs_in_use(vm.firecracker_pid, "memfd:guest_mem") + + snapshot = vm.snapshot_full() + + vm.kill() + + ### Restore Snapshot ### + vm = microvm_factory.build() + vm.spawn() + + # Spawn page fault handler process. + _pf_handler = spawn_pf_handler( + vm, uffd_handler_paths["valid_2m_handler"], snapshot.mem + ) + + vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) + + # Verify if guest can run commands. + rc, _, _ = vm.ssh.run("true") + assert not rc + + check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") From 2a391e1a8396d8c3509d9c05171398ca68180fcb Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 18 Jan 2024 14:34:21 +0000 Subject: [PATCH 10/16] Disallow simultaneous usage of balloon device and huge pages The balloon device does not work with huge pages, so for now disallow using them together. Signed-off-by: Patrick Roy --- src/vmm/src/resources.rs | 11 ++++++++ src/vmm/src/vmm_config/balloon.rs | 2 ++ src/vmm/src/vmm_config/machine_config.rs | 2 ++ .../performance/test_huge_pages.py | 27 +++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 5963ed63502..5a5c6fc0d71 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -257,6 +257,10 @@ impl VmResources { return Err(VmConfigError::IncompatibleBalloonSize); } + if self.balloon.get().is_some() && updated.huge_pages != HugePageConfig::None { + return Err(VmConfigError::BalloonAndHugePages); + } + self.vm_config = updated; Ok(()) @@ -321,6 +325,10 @@ impl VmResources { return Err(BalloonConfigError::TooManyPagesRequested); } + if self.vm_config.huge_pages != HugePageConfig::None { + return Err(BalloonConfigError::HugePages); + } + self.balloon.set(config) } @@ -1382,6 +1390,9 @@ mod tests { if KernelVersion::get().unwrap() >= KernelVersion::new(5, 10, 0) { // mem_size_mib compatible with huge page configuration aux_vm_config.mem_size_mib = 
Some(2048); + // Remove the balloon device config that's added by `default_vm_resources` as it would + // trigger the "ballooning incompatible with huge pages" check. + vm_resources.balloon = BalloonBuilder::new(); vm_resources.update_vm_config(&aux_vm_config).unwrap(); } } diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index d359c871ece..4b4e229b9ec 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -28,6 +28,8 @@ pub enum BalloonConfigError { CreateFailure(crate::devices::virtio::balloon::BalloonError), /// Error updating the balloon device configuration: {0:?} UpdateFailure(std::io::Error), + /// Firecracker's huge pages support is incompatible with memory ballooning. + HugePages, } /// This struct represents the strongly typed equivalent of the json body diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index 0955ece0149..b76025095b2 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -33,6 +33,8 @@ pub enum VmConfigError { KernelVersion, /// Firecracker's hugetlbfs support requires at least host kernel 5.10. HugetlbfsNotSupported, + /// Firecracker's huge pages support is incompatible with memory ballooning. 
+ BalloonAndHugePages, } // We cannot do a `KernelVersion(kernel_version::Error)` variant because `kernel_version::Error` diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index a86f2c9f2e6..b120939ce8f 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -112,3 +112,30 @@ def test_hugetlbfs_snapshot( assert not rc check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_negative_huge_pages_plus_balloon(uvm_plain): + """Tests that huge pages and memory ballooning cannot be used together""" + uvm_plain.memory_monitor = None + uvm_plain.spawn() + + # Ensure setting huge pages and then adding a balloon device doesn't work + uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) + with pytest.raises( + RuntimeError, + match="Firecracker's huge pages support is incompatible with memory ballooning.", + ): + uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) + + # Ensure adding a balloon device and then setting huge pages doesn't work + uvm_plain.basic_config(huge_pages=HugePagesConfig.NONE) + uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) + with pytest.raises( + RuntimeError, + match="Machine config error: Firecracker's huge pages support is incompatible with memory ballooning.", + ): + uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) From fbcc7399b03b5df46ad423d21d747b9f6a09ea55 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 22 Jan 2024 13:34:40 +0000 Subject: [PATCH 11/16] Disallow simultaneous usage of initrd and huge pages Booting our initrd artifact inside a huge-pages enabled VM causes it to get stuck, so for now this is seemingly not supported. 
Signed-off-by: Patrick Roy --- src/vmm/src/resources.rs | 12 ++++++ src/vmm/src/vmm_config/boot_source.rs | 2 + src/vmm/src/vmm_config/machine_config.rs | 2 + .../performance/test_huge_pages.py | 42 +++++++++++++++++++ 4 files changed, 58 insertions(+) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 5a5c6fc0d71..93b4b24822e 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -261,6 +261,12 @@ impl VmResources { return Err(VmConfigError::BalloonAndHugePages); } + if self.boot_source.config.initrd_path.is_some() + && updated.huge_pages != HugePageConfig::None + { + return Err(VmConfigError::InitrdAndHugePages); + } + self.vm_config = updated; Ok(()) @@ -337,6 +343,12 @@ impl VmResources { &mut self, boot_source_cfg: BootSourceConfig, ) -> Result<(), BootSourceConfigError> { + if boot_source_cfg.initrd_path.is_some() + && self.vm_config.huge_pages != HugePageConfig::None + { + return Err(BootSourceConfigError::HugePagesAndInitRd); + } + self.set_boot_source_config(boot_source_cfg); self.boot_source.builder = Some(BootConfig::new(self.boot_source_config())?); Ok(()) diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 24869f1be91..8374ae335a8 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -42,6 +42,8 @@ pub enum BootSourceConfigError { InvalidInitrdPath(io::Error), /// The kernel command line is invalid: {0} InvalidKernelCommandLine(String), + /// Firecracker's huge pages support is incompatible with initrds. + HugePagesAndInitRd, } /// Holds the kernel specification (both configuration as well as runtime details). 
diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index b76025095b2..b012cb2c2c5 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -35,6 +35,8 @@ pub enum VmConfigError { HugetlbfsNotSupported, /// Firecracker's huge pages support is incompatible with memory ballooning. BalloonAndHugePages, + /// Firecracker's huge pages support is incompatible with initrds. + InitrdAndHugePages, } // We cannot do a `KernelVersion(kernel_version::Error)` variant because `kernel_version::Error` diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index b120939ce8f..1850bc52565 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -139,3 +139,45 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): match="Machine config error: Firecracker's huge pages support is incompatible with memory ballooning.", ): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_negative_huge_pages_plus_initrd(uvm_with_initrd): + """Tests that huge pages and initrd cannot be used together""" + uvm_with_initrd.jailer.daemonize = False + uvm_with_initrd.spawn() + uvm_with_initrd.memory_monitor = None + + # Ensure setting huge pages and then telling FC to boot an initrd does not work + with pytest.raises( + RuntimeError, + match="Boot source error: Firecracker's huge pages support is incompatible with initrds.", + ): + # `basic_config` first does a PUT to /machine-config, which will apply the huge pages configuration, + # and then a PUT to /boot-source, which will register the initrd + uvm_with_initrd.basic_config( + boot_args="console=ttyS0 reboot=k panic=1 pci=off", + use_initrd=True, + 
huge_pages=HugePagesConfig.HUGETLBFS_2MB, + add_root_device=False, + vcpu_count=1, + ) + + # Ensure telling FC about the initrd first and then setting huge pages doesn't work + # This first does a PUT to /machine-config to reset the huge pages configuration, before doing a + # PUT to /boot-source to register the initrd + uvm_with_initrd.basic_config( + huge_pages=HugePagesConfig.NONE, + boot_args="console=ttyS0 reboot=k panic=1 pci=off", + use_initrd=True, + ) + with pytest.raises( + RuntimeError, + match="Machine config error: Firecracker's huge pages support is incompatible with initrds.", + ): + uvm_with_initrd.api.machine_config.patch( + huge_pages=HugePagesConfig.HUGETLBFS_2MB + ) From ac687a03609a0150bb7e054aebbdd949eb65ff34 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 25 Jan 2024 13:47:32 +0000 Subject: [PATCH 12/16] test: Add metric tracking the number of EPT_VIOLATIONS post restore An EPT_VIOLATION kvm_exit happens when the MMU's extended page tables are missing an entry for some guest physical address (e.g. some page is present in the guest page tables, but KVM has not yet set up a mapping of guest-physical->host-physical address for it). This happens after snapshot restore even if a page is faulted in via UFFD, as fauling in via UFFD only maps the page into host userspace (e.g. Firecracker), but does not set up the EPT entries. We track the number of EPT_VIOLATIONS post restore when using UFFD for both 4K and 2M pages, as we expect their number to be significantly lower when using huge pages. We use a special UFFD handler that faults in the entire guest memory ahead of time, as otherwise we just track normal page faults. 
Signed-off-by: Patrick Roy --- .../usr/local/bin/fast_page_fault_helper.c | 44 ++++++++++ resources/rebuild.sh | 1 + src/firecracker/Cargo.toml | 4 + .../examples/uffd/fault_all_handler.rs | 50 +++++++++++ src/firecracker/examples/uffd/uffd_utils.rs | 10 +-- tests/conftest.py | 2 +- tests/framework/utils.py | 1 - tests/framework/utils_ftrace.py | 28 ++++++ .../performance/test_huge_pages.py | 88 ++++++++++++++++++- 9 files changed, 220 insertions(+), 8 deletions(-) create mode 100644 resources/overlay/usr/local/bin/fast_page_fault_helper.c create mode 100644 src/firecracker/examples/uffd/fault_all_handler.rs create mode 100644 tests/framework/utils_ftrace.py diff --git a/resources/overlay/usr/local/bin/fast_page_fault_helper.c b/resources/overlay/usr/local/bin/fast_page_fault_helper.c new file mode 100644 index 00000000000..d304b97f94d --- /dev/null +++ b/resources/overlay/usr/local/bin/fast_page_fault_helper.c @@ -0,0 +1,44 @@ +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Helper program for triggering fast page faults after UFFD snapshot restore. +// Allocates a 128M memory area using mmap, touches every page in it using memset and then +// calls `sigwait` to wait for a SIGUSR1 signal. Upon receiving this signal, +// set the entire memory area to 1, to trigger fast page fault. +// The idea is that an integration test takes a snapshot while the process is +// waiting for the SIGUSR1 signal, and then sends the signal after restoring. +// This way, the `memset` will trigger a fast page fault for every page in +// the memory region. 
 + +#include <stdio.h> // perror +#include <signal.h> // sigwait and friends +#include <string.h> // memset +#include <sys/mman.h> // mmap + +#define MEM_SIZE_MIB (128 * 1024 * 1024) + +int main(int argc, char *const argv[]) { + sigset_t set; + int signal; + + sigemptyset(&set); + if(sigaddset(&set, SIGUSR1) == -1) { + perror("sigaddset"); + return -1; + } + + void *ptr = mmap(NULL, MEM_SIZE_MIB, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if(MAP_FAILED == ptr) { + perror("mmap"); + return -1; + } + + memset(ptr, 1, MEM_SIZE_MIB); + + sigwait(&set, &signal); + + memset(ptr, 2, MEM_SIZE_MIB); + + return 0; +} \ No newline at end of file diff --git a/resources/rebuild.sh b/resources/rebuild.sh index 6a40812d6b0..fa2a1e9df6f 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -205,6 +205,7 @@ install_dependencies BIN=overlay/usr/local/bin compile_and_install $BIN/init.c $BIN/init compile_and_install $BIN/fillmem.c $BIN/fillmem +compile_and_install $BIN/fast_page_fault_helper.c $BIN/fast_page_fault_helper compile_and_install $BIN/readmem.c $BIN/readmem if [ $ARCH == "aarch64" ]; then compile_and_install $BIN/devmemread.c $BIN/devmemread diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 73133a966d5..1da89fe698e 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -61,6 +61,10 @@ path = "examples/uffd/valid_4k_handler.rs" name = "uffd_valid_2m_handler" path = "examples/uffd/valid_2m_handler.rs" +[[example]] +name = "uffd_fault_all_handler" +path = "examples/uffd/fault_all_handler.rs" + [[example]] name = "seccomp_harmless" path = "examples/seccomp/harmless.rs" diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs new file mode 100644 index 00000000000..1ab22ada680 --- /dev/null +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -0,0 +1,50 @@ +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//!
Provides functionality for a userspace page fault handler +//! which loads the whole region from the backing memory file +//! when a page fault occurs. + +mod uffd_utils; + +use std::fs::File; +use std::os::unix::net::UnixListener; + +use uffd_utils::{Runtime, UffdHandler}; +use utils::get_page_size; + +fn main() { + let mut args = std::env::args(); + let uffd_sock_path = args.nth(1).expect("No socket path given"); + let mem_file_path = args.next().expect("No memory file given"); + + let file = File::open(mem_file_path).expect("Cannot open memfile"); + + // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. + let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); + let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + + // Unlike the other example handlers, this one populates entire memory + // regions on the first fault, so the exact value passed to `Runtime::run` + // below is irrelevant; any valid page size works. + let len = get_page_size().unwrap(); // page size does not matter, we fault in everything on the first fault + + let mut runtime = Runtime::new(stream, file); + runtime.run(len, |uffd_handler: &mut UffdHandler| { + // Read an event from the userfaultfd. + let event = uffd_handler + .read_event() + .expect("Failed to read uffd_msg") + .expect("uffd_msg not ready"); + + match event { + userfaultfd::Event::Pagefault { ..
} => { + for region in uffd_handler.mem_regions.clone() { + uffd_handler + .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size) + } + } + _ => panic!("Unexpected event on userfaultfd"), + } + }); +} diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index d0e3b3cb91e..d517f785e19 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -40,15 +40,15 @@ pub enum MemPageState { Anonymous, } -#[derive(Debug)] -struct MemRegion { - mapping: GuestRegionUffdMapping, +#[derive(Debug, Clone)] +pub struct MemRegion { + pub mapping: GuestRegionUffdMapping, page_states: HashMap, } #[derive(Debug)] pub struct UffdHandler { - mem_regions: Vec, + pub mem_regions: Vec, page_size: usize, backing_buffer: *const u8, uffd: Uffd, @@ -317,7 +317,7 @@ mod tests { let mut uninit_runtime = Box::new(MaybeUninit::::uninit()); // We will use this pointer to bypass a bunch of Rust Safety // for the sake of convenience. - let runtime_ptr = uninit_runtime.as_ptr() as *const Runtime; + let runtime_ptr = uninit_runtime.as_ptr().cast::(); let runtime_thread = std::thread::spawn(move || { let tmp_file = TempFile::new().unwrap(); diff --git a/tests/conftest.py b/tests/conftest.py index 5b0a8962f7e..0149ceecf66 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -229,7 +229,7 @@ def uffd_handler_paths(): """Build UFFD handler binaries.""" handlers = { f"{handler}_handler": build_tools.get_example(f"uffd_{handler}_handler") - for handler in ["malicious_4k", "valid_4k", "valid_2m"] + for handler in ["malicious_4k", "valid_4k", "valid_2m", "fault_all"] } yield handlers diff --git a/tests/framework/utils.py b/tests/framework/utils.py index 3e982fcbfee..1fec2ccd96a 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -1,7 +1,6 @@ # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 """Generic utility functions that are used in the framework.""" - import functools import glob import json diff --git a/tests/framework/utils_ftrace.py b/tests/framework/utils_ftrace.py new file mode 100644 index 00000000000..6e5d636e33d --- /dev/null +++ b/tests/framework/utils_ftrace.py @@ -0,0 +1,28 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Utilities for interacting with the kernel's ftrace subsystem""" +import contextlib + +from framework.utils import run_cmd + + +@contextlib.contextmanager +def ftrace_events(events: str = "*:*"): + """Temporarily enables the kernel's tracing functionality for the specified events + + Assumes that the caller is the only test executing on the host""" + + # We have to do system-wide tracing because inside docker we live in a pidns, but trace-cmd does not know about + # this. We don't know how to translate the pidns PID to one ftrace would understand, so we use the fact that only + # one vm is running at the same time, and thus we can attribute all KVM events to this one VM + run_cmd("mount -t tracefs nodev /sys/kernel/tracing") + run_cmd("echo > /sys/kernel/tracing/trace") # clear the trace buffers + run_cmd(f"echo {events} > /sys/kernel/tracing/set_event") + run_cmd("echo nop > /sys/kernel/tracing/current_tracer") + run_cmd("echo 1 > /sys/kernel/tracing/tracing_on") + + try: + yield + finally: + run_cmd("echo 0 > /sys/kernel/tracing/tracing_on") + run_cmd("umount /sys/kernel/tracing") diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 1850bc52565..1f06ce53444 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -1,11 +1,15 @@ # Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 """Integration tests for Firecracker's huge pages support""" +import signal +import time + import pytest from framework import utils from framework.microvm import HugePagesConfig from framework.properties import global_props +from framework.utils_ftrace import ftrace_events from integration_tests.functional.test_uffd import SOCKET_PATH, spawn_pf_handler @@ -64,7 +68,10 @@ def test_hugetlbfs_boot(uvm_plain): rc, _, _ = uvm_plain.ssh.run("true") assert not rc - check_hugetlbfs_in_use(uvm_plain.firecracker_pid, "memfd:guest_mem") + check_hugetlbfs_in_use( + uvm_plain.firecracker_pid, + "memfd:guest_mem", + ) @pytest.mark.skipif( @@ -114,6 +121,85 @@ def test_hugetlbfs_snapshot( check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +@pytest.mark.parametrize("huge_pages", HugePagesConfig) +def test_ept_violation_count( + microvm_factory, + guest_kernel_linux_5_10, + rootfs_ubuntu_22, + uffd_handler_paths, + metrics, + huge_pages, +): + """ + Tests hugetlbfs snapshot restore with a UFFD handler that pre-faults the entire guest memory + on the first page fault. Records metrics about the number of EPT_VIOLATIONS encountered by KVM. + """ + + ### Create Snapshot ### + vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs_ubuntu_22) + vm.memory_monitor = None + vm.spawn() + vm.basic_config(huge_pages=huge_pages, mem_size_mib=256) + vm.add_net_iface() + vm.start() + + metrics.set_dimensions( + { + "performance_test": "test_hugetlbfs_snapshot", + "huge_pages_config": str(huge_pages), + **vm.dimensions, + } + ) + + # Wait for microvm to boot. Then spawn fast_page_fault_helper to setup an environment where we can trigger + # a lot of fast_page_faults after restoring the snapshot. 
+ rc, _, _ = vm.ssh.run( + "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 Date: Mon, 5 Feb 2024 14:44:32 +0000 Subject: [PATCH 13/16] test: differential snapshots and hugepages works Differential snapshots work with hugetlbfs pages out of the box. This is because despite guest memory being backed by 2M pages, KVM still keeps a dirty log at 4K granularity. This means we do not need to adjust our differential snapshot logic to handle 2M chunks, as the existing logic for 4K chunks stays valid. Signed-off-by: Patrick Roy --- .../performance/test_huge_pages.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 1f06ce53444..40817ffec3e 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -121,6 +121,59 @@ def test_hugetlbfs_snapshot( check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain, uffd_handler_paths): + """ + Test hugetlbfs differential snapshot support. + + Despite guest memory being backed by huge pages, differential snapshots still work at 4K granularity. 
+ """ + + ### Create Snapshot ### + uvm_plain.memory_monitor = None + uvm_plain.spawn() + uvm_plain.basic_config( + huge_pages=HugePagesConfig.HUGETLBFS_2MB, + mem_size_mib=128, + track_dirty_pages=True, + ) + uvm_plain.add_net_iface() + uvm_plain.start() + + # Wait for microvm to boot + rc, _, _ = uvm_plain.ssh.run("true") + assert not rc + + base_snapshot = uvm_plain.snapshot_diff() + uvm_plain.resume() + + # Run command to dirty some pages + rc, _, _ = uvm_plain.ssh.run("sync") + assert not rc + + snapshot_diff = uvm_plain.snapshot_diff() + snapshot_merged = snapshot_diff.rebase_snapshot(base_snapshot) + + uvm_plain.kill() + + vm = microvm_factory.build() + vm.spawn() + + # Spawn page fault handler process. + _pf_handler = spawn_pf_handler( + vm, uffd_handler_paths["valid_2m_handler"], snapshot_merged.mem + ) + + vm.restore_from_snapshot(snapshot_merged, resume=True, uffd_path=SOCKET_PATH) + + # Verify if guest can run commands. + rc, _, _ = vm.ssh.run("true") + assert not rc + + @pytest.mark.skipif( global_props.host_linux_version == "4.14", reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", From f789bd534279fdfc784418ebf6a46ef29471144b Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 24 Jan 2024 11:52:32 +0000 Subject: [PATCH 14/16] docs: Add documentation for hugepages feature Document how to use hugetlbfs with Firecracker. Signed-off-by: Patrick Roy --- docs/hugepages.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 docs/hugepages.md diff --git a/docs/hugepages.md b/docs/hugepages.md new file mode 100644 index 00000000000..a5105cc802b --- /dev/null +++ b/docs/hugepages.md @@ -0,0 +1,55 @@ +# Backing Guest Memory by Huge Pages + +> \[!WARNING\] +> +> Support is currently in **developer preview**. See +> [this section](RELEASE_POLICY.md#developer-preview-features) for more info. + +Firecracker supports backing the guest memory of a VM by 2MB hugetlbfs pages. 
+This can be enabled by setting the `huge_pages` field of `PUT` or `PATCH` +requests to the `/machine-config` endpoint to `2M`. + +Backing guest memory by huge pages can bring performance improvements for +specific workloads, due to less TLB contention and less overhead during +virtual->physical address resolution. It can also help reduce the number of +KVM_EXITS required to rebuild extended page tables post snapshot restore, as +well as improve boot times (by up to 50% as measured by Firecracker's +[boot time performance tests](../tests/integration_tests/performance/test_boottime.py)) + +Using hugetlbfs requires the host running Firecracker to have a pre-allocated +pool of 2M pages. Should this pool be too small, Firecracker may behave +erratically or receive the `SIGBUS` signal. This is because Firecracker uses the +`MAP_NORESERVE` flag when mapping guest memory. This flag means the kernel will +not try to reserve sufficient hugetlbfs pages at the time of the `mmap` call, +trying to claim them from the pool on-demand. For details on how to manage this +pool, please refer to the [Linux Documentation][hugetlbfs_docs]. + +## Huge Pages and Snapshotting + +Restoring a Firecracker snapshot of a microVM backed by huge pages will also use +huge pages to back the restored guest. There is no option to flip between +regular, 4K, pages and huge pages at restore time. Furthermore, snapshots of +microVMs backed with huge pages can only be restored via UFFD. Lastly, note that +even for guests backed by huge pages, differential snapshots will always track +write accesses to guest memory at 4K granularity. + +## Known Limitations + +Currently, hugetlbfs support is mutually exclusive with the following +Firecracker features: + +- Memory Ballooning via the [Balloon Device](./ballooning.md) +- Initrd + +## FAQ + +### Why does Firecracker not offer a transparent huge pages (THP) setting? + +Firecracker's guest memory is memfd based. 
Linux (as of 6.1) does not offer a +way to dynamically enable THP for such memory regions. Additionally, UFFD does +not integrate with THP (no transparent huge pages will be allocated during +userfaulting). Please refer to the [Linux Documentation][thp_docs] for more +information. + +[hugetlbfs_docs]: https://docs.kernel.org/admin-guide/mm/hugetlbpage.html +[thp_docs]: https://www.kernel.org/doc/html/next/admin-guide/mm/transhuge.html#hugepages-in-tmpfs-shmem From 27eeefd179186bb96b7893e088c18fdce9f3d9fc Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 24 Jan 2024 13:08:56 +0000 Subject: [PATCH 15/16] docs: Update swagger.yml with huge_pages field Add the huge_pages field to the /machine-config documentation. Signed-off-by: Patrick Roy --- src/firecracker/swagger/firecracker.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 918202b57f4..adfbab11b86 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -331,9 +331,10 @@ paths: The vCPU count is restricted to the [1, 32] range. With SMT enabled, the vCPU count is required to be either 1 or an even number in the range. otherwise there are no restrictions regarding the vCPU count. + If 2M hugetlbfs pages are specified, then `mem_size_mib` must be a multiple of 2. If any of the parameters has an incorrect value, the whole update fails. All parameters that are optional and are not specified are set to their default values - (smt = false, track_dirty_pages = false, cpu_template = None). + (smt = false, track_dirty_pages = false, cpu_template = None, huge_pages = None). 
operationId: putMachineConfiguration parameters: - name: body @@ -1015,7 +1016,7 @@ definitions: MachineConfiguration: type: object description: - Describes the number of vCPUs, memory size, SMT capabilities and + Describes the number of vCPUs, memory size, SMT capabilities, huge page configuration and the CPU template. required: - mem_size_mib @@ -1043,6 +1044,12 @@ definitions: minimum: 1 maximum: 32 description: Number of vCPUs (either 1 or an even number) + huge_pages: + type: string + enum: + - None + - 2M + description: Which huge pages configuration (if any) should be used to back guest memory. MemoryBackend: type: object From f68c09ac485158575ec55e446b9d24b9e9d2b537 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 24 Jan 2024 13:11:03 +0000 Subject: [PATCH 16/16] docs: Update CHANGELOG.md Add an entry about huge page support. Signed-off-by: Patrick Roy --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6293caa831..2b472acf2b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ and this project adheres to `VcpuExit::MmioRead`, `VcpuExit::MmioWrite`, `VcpuExit::IoIn` and `VcpuExit::IoOut`. The average for these VM exits is not emitted since it can be deduced from the available emitted metrics. +- [#4360](https://github.com/firecracker-microvm/firecracker/pull/4360): Added + dev-preview support for backing a VM's guest memory by 2M hugetlbfs pages. + Please see the [documentation](docs/hugepages.md) for more information. ### Changed