diff --git a/src/arch/src/lib.rs b/src/arch/src/lib.rs index 14d5aeb1e65..4e2b637b2b2 100644 --- a/src/arch/src/lib.rs +++ b/src/arch/src/lib.rs @@ -67,3 +67,32 @@ impl fmt::Display for DeviceType { write!(f, "{:?}", self) } } + +/// Suported boot protocols for +#[derive(Debug, Copy, Clone)] +pub enum BootProtocol { + /// Linux 64-bit boot protocol + LinuxBoot, + /// PVH boot protocol (x86/HVM direct boot ABI) + PvhBoot, +} + +impl fmt::Display for BootProtocol { + fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + match self { + BootProtocol::LinuxBoot => write!(f, "Linux 64-bit boot protocol"), + BootProtocol::PvhBoot => write!(f, "PVH boot protocol"), + } + } +} + +#[derive(Debug, Copy, Clone)] +/// Specifies the entry point address where the guest must start +/// executing code, as well as which boot protocol is to be used +/// to configure the guest initial state. +pub struct EntryPoint { + /// Address in guest memory where the guest must start execution + pub entry_addr: vm_memory::GuestAddress, + /// Specifies which boot protocol to use + pub protocol: BootProtocol, +} diff --git a/src/arch/src/x86_64/gdt.rs b/src/arch/src/x86_64/gdt.rs index 41b33f92255..a03a5eabd11 100644 --- a/src/arch/src/x86_64/gdt.rs +++ b/src/arch/src/x86_64/gdt.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -24,8 +26,34 @@ fn get_base(entry: u64) -> u64 { | (((entry) & 0x0000_0000_FFFF_0000) >> 16)) } +// Extract the segment limit from the GDT segment descriptor. +// +// In a segment descriptor, the limit field is 20 bits, so it can directly describe +// a range from 0 to 0xFFFFF (1 MB). When G flag is set (4-KByte page granularity) it +// scales the value in the limit field by a factor of 2^12 (4 Kbytes), making the effective +// limit range from 0xFFF (4 KBytes) to 0xFFFF_FFFF (4 GBytes). 
+// +// However, the limit field in the VMCS definition is a 32 bit field, and the limit value is not +// automatically scaled using the G flag. This means that for a desired range of 4GB for a +// given segment, its limit must be specified as 0xFFFF_FFFF. Therefore the method of obtaining +// the limit from the GDT entry is not sufficient, since it only provides 20 bits when 32 bits +// are necessary. Fortunately, we can check if the G flag is set when extracting the limit since +// the full GDT entry is passed as an argument, and perform the scaling of the limit value to +// return the full 32 bit value. +// +// The scaling mentioned above is required when using PVH boot, since the guest boots in protected +// (32-bit) mode and must be able to access the entire 32-bit address space. It does not cause issues +// for the case of direct boot to 64-bit (long) mode, since in 64-bit mode the processor does not +// perform runtime limit checking on code or data segments. fn get_limit(entry: u64) -> u32 { - ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32 + let limit: u32 = + ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32; + + // Perform manual limit scaling if G flag is set + match get_g(entry) { + 0 => limit, + _ => ((limit << 12) | 0xFFF), // G flag is either 0 or 1 + } } fn get_g(entry: u64) -> u8 { @@ -109,7 +137,7 @@ mod tests { assert_eq!(0xB, seg.type_); // base and limit assert_eq!(0x10_0000, seg.base); - assert_eq!(0xfffff, seg.limit); + assert_eq!(0xffff_ffff, seg.limit); assert_eq!(0x0, seg.unusable); } } diff --git a/src/arch/src/x86_64/layout.rs b/src/arch/src/x86_64/layout.rs index b4c3f149428..0886d0f65e3 100644 --- a/src/arch/src/x86_64/layout.rs +++ b/src/arch/src/x86_64/layout.rs @@ -27,5 +27,16 @@ pub const IRQ_MAX: u32 = 15; /// Address for the TSS setup. 
pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; +/// Address of the hvm_start_info struct used in PVH boot +pub const PVH_INFO_START: u64 = 0x6000; + +/// Starting address of array of modules of hvm_modlist_entry type. +/// Used to enable initrd support using the PVH boot ABI. +pub const MODLIST_START: u64 = 0x6040; + +/// Address of memory map table used in PVH boot. Can overlap +/// with the zero page address since they are mutually exclusive. +pub const MEMMAP_START: u64 = 0x7000; + /// The 'zero page', a.k.a linux kernel bootparams. pub const ZERO_PAGE_START: u64 = 0x7000; diff --git a/src/arch/src/x86_64/mod.rs b/src/arch/src/x86_64/mod.rs index 4605bc69a18..3d9efd9c917 100644 --- a/src/arch/src/x86_64/mod.rs +++ b/src/arch/src/x86_64/mod.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -17,9 +19,12 @@ pub mod msr; pub mod regs; use arch_gen::x86::bootparam::{boot_params, E820_RAM}; +use arch_gen::x86::start_info::{hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info}; +use std::mem; use vm_memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; +use BootProtocol; use InitrdConfig; // This is a workaround to the Rust enforcement specifying that any implementation of a foreign @@ -33,6 +38,25 @@ struct BootParamsWrapper(boot_params); // It is safe to initialize BootParamsWrap which is a wrapper over `boot_params` (a series of ints). unsafe impl ByteValued for BootParamsWrapper {} +// Workaround for the Rust orphan rules that guarantee trait coherence by wrapping the foreign type +// in a tuple structure. Same approach is used by boot_params and BootParamsWrapper. 
+#[derive(Copy, Clone, Default)] +struct StartInfoWrapper(hvm_start_info); + +#[derive(Copy, Clone, Default)] +struct MemmapTableEntryWrapper(hvm_memmap_table_entry); + +#[derive(Copy, Clone, Default)] +struct ModlistEntryWrapper(hvm_modlist_entry); + +// It is safe to initialize the following structures. They are wrappers over the structures +// defined by the start_info module, all of which are formed by fields of integer values. +unsafe impl ByteValued for StartInfoWrapper {} +unsafe impl ByteValued for MemmapTableEntryWrapper {} +unsafe impl ByteValued for ModlistEntryWrapper {} + +const MEMMAP_TYPE_RAM: u32 = 1; + /// Errors thrown while configuring x86_64 system. #[derive(Debug, PartialEq)] pub enum Error { @@ -44,6 +68,12 @@ pub enum Error { ZeroPageSetup, /// Failed to compute initrd address. InitrdAddress, + /// Error writing module entry to guest memory. + ModlistSetup, + /// Error writing memory map table to guest memory. + MemmapTableSetup, + /// Error writing hvm_start_info to guest memory. + StartInfoSetup, } // Where BIOS/VGA magic would live on a real PC. @@ -101,12 +131,151 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> supe /// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator. /// * `initrd` - Information about where the ramdisk image was loaded in the `guest_mem`. /// * `num_cpus` - Number of virtual CPUs the guest will have. +/// * `boot_prot` - Boot protocol that will be used to boot the guest. 
pub fn configure_system( guest_mem: &GuestMemoryMmap, cmdline_addr: GuestAddress, cmdline_size: usize, initrd: &Option, num_cpus: u8, + boot_prot: BootProtocol, +) -> super::Result<()> { + // Note that this puts the mptable at the last 1k of Linux's 640k base RAM + mptable::setup_mptable(guest_mem, num_cpus).map_err(Error::MpTableSetup)?; + + match boot_prot { + BootProtocol::PvhBoot => { + configure_pvh(guest_mem, cmdline_addr, initrd)?; + } + BootProtocol::LinuxBoot => { + configure_64bit_boot(guest_mem, cmdline_addr, cmdline_size, initrd)?; + } + } + + Ok(()) +} + +fn configure_pvh( + guest_mem: &GuestMemoryMmap, + cmdline_addr: GuestAddress, + initrd: &Option, +) -> super::Result<()> { + const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; + let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); + let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let himem_start = GuestAddress(layout::HIMEM_START); + + let mut start_info: StartInfoWrapper = StartInfoWrapper(hvm_start_info::default()); + + start_info.0.magic = XEN_HVM_START_MAGIC_VALUE; + start_info.0.version = 1; // pvh has version 1 + start_info.0.cmdline_paddr = cmdline_addr.raw_value(); + start_info.0.memmap_paddr = layout::MEMMAP_START; + + if let Some(initrd_config) = initrd { + // The initrd has been written to guest memory already, here we just need to + // create the module structure that describes it. + let ramdisk_mod: ModlistEntryWrapper = ModlistEntryWrapper(hvm_modlist_entry { + paddr: initrd_config.address.raw_value(), + size: initrd_config.size as u64, + ..Default::default() + }); + + start_info.0.nr_modules += 1; + start_info.0.modlist_paddr = layout::MODLIST_START; + + // Write the modlist struct to guest memory. + guest_mem + .write_obj(ramdisk_mod, GuestAddress(layout::MODLIST_START)) + .map_err(|_| Error::ModlistSetup)?; + } + + // Vector to hold the memory maps which needs to be written to guest memory + // at MEMMAP_START after all of the mappings are recorded. 
+ let mut memmap: Vec = Vec::new(); + + // Create the memory map entries. + add_memmap_entry(&mut memmap, 0, EBDA_START, MEMMAP_TYPE_RAM)?; + + let last_addr = guest_mem.last_addr(); + if last_addr < end_32bit_gap_start { + add_memmap_entry( + &mut memmap, + himem_start.raw_value() as u64, + last_addr.unchecked_offset_from(himem_start) as u64 + 1, + MEMMAP_TYPE_RAM, + )?; + } else { + add_memmap_entry( + &mut memmap, + himem_start.raw_value(), + end_32bit_gap_start.unchecked_offset_from(himem_start), + MEMMAP_TYPE_RAM, + )?; + + if last_addr > first_addr_past_32bits { + add_memmap_entry( + &mut memmap, + first_addr_past_32bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, + MEMMAP_TYPE_RAM, + )?; + } + } + + start_info.0.memmap_entries = memmap.len() as u32; + + // Copy the vector with the memmap table to the MEMMAP_START address + // which is already saved in the memmap_paddr field of hvm_start_info struct. + let mut memmap_start_addr = GuestAddress(layout::MEMMAP_START); + + // For every entry in the memmap vector, create a MemmapTableEntryWrapper + // and write it to guest memory. + for memmap_entry in memmap { + let map_entry_wrapper: MemmapTableEntryWrapper = MemmapTableEntryWrapper(memmap_entry); + + guest_mem + .write_obj(map_entry_wrapper, memmap_start_addr) + .map_err(|_| Error::MemmapTableSetup)?; + memmap_start_addr = + memmap_start_addr.unchecked_add(mem::size_of::() as u64); + } + + // The hvm_start_info struct itself must be stored at PVH_START_INFO + // address, and %rbx will be initialized to contain PVH_INFO_START prior to + // starting the guest, as required by the PVH ABI. + let start_info_addr = GuestAddress(layout::PVH_INFO_START); + + // Write the start_info struct to guest memory. 
+ guest_mem + .write_obj(start_info, start_info_addr) + .map_err(|_| Error::StartInfoSetup)?; + + Ok(()) +} + +fn add_memmap_entry( + memmap: &mut Vec, + addr: u64, + size: u64, + mem_type: u32, +) -> super::Result<()> { + // Add the table entry to the vector + memmap.push(hvm_memmap_table_entry { + addr, + size, + type_: mem_type, + reserved: 0, + }); + + Ok(()) +} + +fn configure_64bit_boot( + guest_mem: &GuestMemoryMmap, + cmdline_addr: GuestAddress, + cmdline_size: usize, + initrd: &Option, ) -> super::Result<()> { const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; @@ -117,9 +286,6 @@ pub fn configure_system( let himem_start = GuestAddress(layout::HIMEM_START); - // Note that this puts the mptable at the last 1k of Linux's 640k base RAM - mptable::setup_mptable(guest_mem, num_cpus).map_err(Error::MpTableSetup)?; - let mut params: BootParamsWrapper = BootParamsWrapper(boot_params::default()); params.0.hdr.type_of_loader = KERNEL_LOADER_OTHER; @@ -220,7 +386,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); - let config_err = configure_system(&gm, GuestAddress(0), 0, &None, 1); + let config_err = + configure_system(&gm, GuestAddress(0), 0, &None, 1, BootProtocol::LinuxBoot); assert!(config_err.is_err()); assert_eq!( config_err.unwrap_err(), @@ -231,19 +398,73 @@ mod tests { let mem_size = 128 << 20; let arch_mem_regions = arch_memory_regions(mem_size); let gm = GuestMemoryMmap::from_ranges(&arch_mem_regions).unwrap(); - configure_system(&gm, GuestAddress(0), 0, &None, no_vcpus).unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::LinuxBoot, + ) + .unwrap(); + + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::PvhBoot, + ) + .unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. 
let mem_size = 3328 << 20; let arch_mem_regions = arch_memory_regions(mem_size); let gm = GuestMemoryMmap::from_ranges(&arch_mem_regions).unwrap(); - configure_system(&gm, GuestAddress(0), 0, &None, no_vcpus).unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::LinuxBoot, + ) + .unwrap(); + + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::PvhBoot, + ) + .unwrap(); // Now assigning some memory that falls after the 32bit memory hole. let mem_size = 3330 << 20; let arch_mem_regions = arch_memory_regions(mem_size); let gm = GuestMemoryMmap::from_ranges(&arch_mem_regions).unwrap(); - configure_system(&gm, GuestAddress(0), 0, &None, no_vcpus).unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::LinuxBoot, + ) + .unwrap(); + + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::PvhBoot, + ) + .unwrap(); } #[test] @@ -285,4 +506,31 @@ mod tests { ) .is_err()); } + + #[test] + fn test_add_memmap_entry() { + const MEMMAP_TYPE_RESERVED: u32 = 2; + + let mut memmap: Vec = Vec::new(); + + let expected_memmap = vec![ + hvm_memmap_table_entry { + addr: 0x0, + size: 0x1000, + type_: MEMMAP_TYPE_RAM, + ..Default::default() + }, + hvm_memmap_table_entry { + addr: 0x10000, + size: 0xa000, + type_: MEMMAP_TYPE_RESERVED, + ..Default::default() + }, + ]; + + add_memmap_entry(&mut memmap, 0, 0x1000, MEMMAP_TYPE_RAM).unwrap(); + add_memmap_entry(&mut memmap, 0x10000, 0xa000, MEMMAP_TYPE_RESERVED).unwrap(); + + assert_eq!(format!("{:?}", memmap), format!("{:?}", expected_memmap)); + } } diff --git a/src/arch/src/x86_64/regs.rs b/src/arch/src/x86_64/regs.rs index 8ea36d09003..a41bda78947 100644 --- a/src/arch/src/x86_64/regs.rs +++ b/src/arch/src/x86_64/regs.rs @@ -1,3 +1,4 @@ +// Copyright © 2020, Oracle and/or its affiliates. // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +12,8 @@ use super::gdt::{gdt_entry, kvm_segment_from_gdt}; use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs}; use kvm_ioctls::VcpuFd; use vm_memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; +use BootProtocol; +use EntryPoint; // Initial pagetables. const PML4_START: u64 = 0x9000; @@ -62,19 +65,29 @@ pub fn setup_fpu(vcpu: &VcpuFd) -> Result<()> { /// /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. /// * `boot_ip` - Starting instruction pointer. -pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64) -> Result<()> { - let regs: kvm_regs = kvm_regs { - rflags: 0x0000_0000_0000_0002u64, - rip: boot_ip, - // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are - // made to rsp (i.e. reserving space for local variables or pushing values on to the stack), - // local variables and function parameters are still accessible from a constant offset from rbp. - rsp: super::layout::BOOT_STACK_POINTER as u64, - // Starting stack pointer. - rbp: super::layout::BOOT_STACK_POINTER as u64, - // Must point to zero page address per Linux ABI. This is x86_64 specific. - rsi: super::layout::ZERO_PAGE_START as u64, - ..Default::default() +pub fn setup_regs(vcpu: &VcpuFd, entry_point: EntryPoint) -> Result<()> { + let regs: kvm_regs = match entry_point.protocol { + // Configure regs as required by PVH boot protocol. + BootProtocol::PvhBoot => kvm_regs { + rflags: 0x0000_0000_0000_0002u64, + rbx: super::layout::PVH_INFO_START, + rip: entry_point.entry_addr.raw_value(), + ..Default::default() + }, + // Configure regs as required by Linux 64-bit boot protocol. + BootProtocol::LinuxBoot => kvm_regs { + rflags: 0x0000_0000_0000_0002u64, + rip: entry_point.entry_addr.raw_value(), + // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are + // made to rsp (i.e. 
reserving space for local variables or pushing values on to the stack), + // local variables and function parameters are still accessible from a constant offset from rbp. + rsp: super::layout::BOOT_STACK_POINTER as u64, + // Starting stack pointer. + rbp: super::layout::BOOT_STACK_POINTER as u64, + // Must point to zero page address per Linux ABI. This is x86_64 specific. + rsi: super::layout::ZERO_PAGE_START as u64, + ..Default::default() + }, }; vcpu.set_regs(®s).map_err(Error::SetBaseRegisters) @@ -86,11 +99,13 @@ pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64) -> Result<()> { /// /// * `mem` - The memory that will be passed to the guest. /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. -pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd) -> Result<()> { +pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd, boot_prot: BootProtocol) -> Result<()> { let mut sregs: kvm_sregs = vcpu.get_sregs().map_err(Error::GetStatusRegisters)?; - configure_segments_and_sregs(mem, &mut sregs)?; - setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead? + configure_segments_and_sregs(mem, &mut sregs, boot_prot)?; + if let BootProtocol::LinuxBoot = boot_prot { + setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead? 
+ } vcpu.set_sregs(&sregs).map_err(Error::SetStatusRegisters) } @@ -104,6 +119,7 @@ const EFER_LMA: u64 = 0x400; const EFER_LME: u64 = 0x100; const X86_CR0_PE: u64 = 0x1; +const X86_CR0_ET: u64 = 0x10; const X86_CR0_PG: u64 = 0x8000_0000; const X86_CR4_PAE: u64 = 0x20; @@ -127,13 +143,31 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> { .map_err(|_| Error::WriteIDT) } -fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()> { - let gdt_table: [u64; BOOT_GDT_MAX as usize] = [ - gdt_entry(0, 0, 0), // NULL - gdt_entry(0xa09b, 0, 0xfffff), // CODE - gdt_entry(0xc093, 0, 0xfffff), // DATA - gdt_entry(0x808b, 0, 0xfffff), // TSS - ]; +fn configure_segments_and_sregs( + mem: &GuestMemoryMmap, + sregs: &mut kvm_sregs, + boot_prot: BootProtocol, +) -> Result<()> { + let gdt_table: [u64; BOOT_GDT_MAX as usize] = match boot_prot { + BootProtocol::PvhBoot => { + // Configure GDT entries as specified by PVH boot protocol + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xc09b, 0, 0xffff_ffff), // CODE + gdt_entry(0xc093, 0, 0xffff_ffff), // DATA + gdt_entry(0x008b, 0, 0x67), // TSS + ] + } + BootProtocol::LinuxBoot => { + // Configure GDT entries as specified by Linux 64bit boot protocol + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ] + } + }; let code_seg = kvm_segment_from_gdt(gdt_table[1], 1); let data_seg = kvm_segment_from_gdt(gdt_table[2], 2); @@ -156,9 +190,17 @@ fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> sregs.ss = data_seg; sregs.tr = tss_seg; - /* 64-bit protected mode */ - sregs.cr0 |= X86_CR0_PE; - sregs.efer |= EFER_LME | EFER_LMA; + match boot_prot { + BootProtocol::PvhBoot => { + sregs.cr0 = X86_CR0_PE | X86_CR0_ET; + sregs.cr4 = 0; + } + BootProtocol::LinuxBoot => { + /* 64-bit protected mode */ + sregs.cr0 |= X86_CR0_PE; + sregs.efer |= EFER_LME | 
EFER_LMA; + } + } Ok(()) } @@ -204,33 +246,58 @@ mod tests { gm.read_obj(read_addr).unwrap() } - fn validate_segments_and_sregs(gm: &GuestMemoryMmap, sregs: &kvm_sregs) { + fn validate_segments_and_sregs( + gm: &GuestMemoryMmap, + sregs: &kvm_sregs, + boot_prot: BootProtocol, + ) { + if let BootProtocol::LinuxBoot = boot_prot { + assert_eq!(0xaf_9b00_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 8)); + assert_eq!(0xcf_9300_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 16)); + assert_eq!(0x8f_8b00_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 24)); + + assert_eq!(0xffff_ffff, sregs.tr.limit); + + assert!(sregs.cr0 & X86_CR0_PE != 0); + assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0); + } else { + // Validate values that are specific to PVH boot protocol + assert_eq!(0xcf_9b00_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 8)); + assert_eq!(0xcf_9300_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 16)); + assert_eq!(0x00_8b00_0000_0067, read_u64(&gm, BOOT_GDT_OFFSET + 24)); + + assert_eq!(0x67, sregs.tr.limit); + assert_eq!(0, sregs.tr.g); + + assert!(sregs.cr0 & X86_CR0_PE != 0 && sregs.cr0 & X86_CR0_ET != 0); + assert_eq!(0, sregs.cr4); + } + + // Common settings for both PVH and Linux boot protocol assert_eq!(0x0, read_u64(&gm, BOOT_GDT_OFFSET)); - assert_eq!(0xaf_9b00_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 8)); - assert_eq!(0xcf_9300_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 16)); - assert_eq!(0x8f_8b00_0000_ffff, read_u64(&gm, BOOT_GDT_OFFSET + 24)); assert_eq!(0x0, read_u64(&gm, BOOT_IDT_OFFSET)); assert_eq!(0, sregs.cs.base); - assert_eq!(0xfffff, sregs.ds.limit); + assert_eq!(0xffff_ffff, sregs.ds.limit); assert_eq!(0x10, sregs.es.selector); assert_eq!(1, sregs.fs.present); assert_eq!(1, sregs.gs.g); assert_eq!(0, sregs.ss.avl); assert_eq!(0, sregs.tr.base); - assert_eq!(0xfffff, sregs.tr.limit); assert_eq!(0, sregs.tr.avl); - assert!(sregs.cr0 & X86_CR0_PE != 0); - assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0); } #[test] fn 
test_configure_segments_and_sregs() { let mut sregs: kvm_sregs = Default::default(); let gm = create_guest_mem(); - configure_segments_and_sregs(&gm, &mut sregs).unwrap(); + configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::LinuxBoot).unwrap(); + + validate_segments_and_sregs(&gm, &sregs, BootProtocol::LinuxBoot); + + configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::PvhBoot).unwrap(); - validate_segments_and_sregs(&gm, &sregs); + validate_segments_and_sregs(&gm, &sregs, BootProtocol::PvhBoot); } fn validate_page_tables(gm: &GuestMemoryMmap, sregs: &kvm_sregs) { @@ -291,7 +358,12 @@ mod tests { ..Default::default() }; - setup_regs(&vcpu, expected_regs.rip).unwrap(); + let entry_point: EntryPoint = EntryPoint { + entry_addr: GuestAddress(expected_regs.rip), + protocol: BootProtocol::LinuxBoot, + }; + + setup_regs(&vcpu, entry_point).unwrap(); let actual_regs: kvm_regs = vcpu.get_regs().unwrap(); assert_eq!(actual_regs, expected_regs); @@ -304,15 +376,21 @@ mod tests { let vcpu = vm.create_vcpu(0).unwrap(); let gm = create_guest_mem(); - assert!(vcpu.set_sregs(&Default::default()).is_ok()); - setup_sregs(&gm, &vcpu).unwrap(); - - let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap(); - // for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment. - // We set it to 1, otherwise the test will fail. - sregs.gs.g = 1; - - validate_segments_and_sregs(&gm, &sregs); - validate_page_tables(&gm, &sregs); + [BootProtocol::LinuxBoot, BootProtocol::PvhBoot] + .iter() + .for_each(|boot_prot| { + assert!(vcpu.set_sregs(&Default::default()).is_ok()); + setup_sregs(&gm, &vcpu, *boot_prot).unwrap(); + + let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap(); + // for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment. + // We set it to 1, otherwise the test will fail. 
+ sregs.gs.g = 1; + + validate_segments_and_sregs(&gm, &sregs, *boot_prot); + if let BootProtocol::LinuxBoot = *boot_prot { + validate_page_tables(&gm, &sregs); + } + }); } } diff --git a/src/arch_gen/src/x86/mod.rs b/src/arch_gen/src/x86/mod.rs index b0dae9eff18..d35cb47e65f 100644 --- a/src/arch_gen/src/x86/mod.rs +++ b/src/arch_gen/src/x86/mod.rs @@ -14,3 +14,4 @@ pub mod bootparam; pub mod mpspec; #[allow(non_upper_case_globals)] pub mod msr_index; +pub mod start_info; diff --git a/src/arch_gen/src/x86/start_info.rs b/src/arch_gen/src/x86/start_info.rs new file mode 100644 index 00000000000..43bb5fd77f7 --- /dev/null +++ b/src/arch_gen/src/x86/start_info.rs @@ -0,0 +1,394 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2016, Citrix Systems, Inc. 
+ */ + +/* + * automatically generated by rust-bindgen using: + * + * # bindgen start_info.h -- -include stdint.h > start_info.rs + * + * From the canonical version in upstream Xen repository + * xen/include/public/arch-x86/hvm/start_info.h + * at commit: + * a2e84d8e42c9e878fff17b738d8e5c5d83888f31 + * + * The generated file has been edited to eliminate unnecessary + * definitions, add comments, and relocate definitions and tests for clarity. + * Added Default to the list of traits that are automatically derived. + * + * The definitions in this file are intended to be exported and used by the VMM + * in order to boot a Linux guest using the PVH entry point as specified in the + * x86/HVM direct boot ABI. + * These structures contain all the required information (cmdline address, ACPI RSDP, + * memory maps, etc) that must be written to guest memory before starting guest + * execution by jumping to the PVH entry point address. + * A comparable set of definitions to hvm_start_info and hvm_memmap_table_entry in this + * file would be the boot_params and boot_e820_entry definitions used by the Linux + * 64-bit boot protocol. + * + * Start of day structure passed to PVH guests and to HVM guests in %ebx. + * + * NOTE: nothing will be loaded at physical address 0, so a 0 value in any + * of the address fields should be treated as not present. + * + * 0 +----------------+ + * | magic | Contains the magic value XEN_HVM_START_MAGIC_VALUE + * | | ("xEn3" with the 0x80 bit of the "E" set). + * 4 +----------------+ + * | version | Version of this structure. Current version is 1. New + * | | versions are guaranteed to be backwards-compatible. + * 8 +----------------+ + * | flags | SIF_xxx flags. + * 12 +----------------+ + * | nr_modules | Number of modules passed to the kernel. + * 16 +----------------+ + * | modlist_paddr | Physical address of an array of modules + * | | (layout of the structure below). 
+ * 24 +----------------+ + * | cmdline_paddr | Physical address of the command line, + * | | a zero-terminated ASCII string. + * 32 +----------------+ + * | rsdp_paddr | Physical address of the RSDP ACPI data structure. + * 40 +----------------+ + * | memmap_paddr | Physical address of the (optional) memory map. Only + * | | present in version 1 and newer of the structure. + * 48 +----------------+ + * | memmap_entries | Number of entries in the memory map table. Zero + * | | if there is no memory map being provided. Only + * | | present in version 1 and newer of the structure. + * 52 +----------------+ + * | reserved | Version 1 and newer only. + * 56 +----------------+ + * + * The layout of each entry in the module structure is the following: + * + * 0 +----------------+ + * | paddr | Physical address of the module. + * 8 +----------------+ + * | size | Size of the module in bytes. + * 16 +----------------+ + * | cmdline_paddr | Physical address of the command line, + * | | a zero-terminated ASCII string. + * 24 +----------------+ + * | reserved | + * 32 +----------------+ + * + * The layout of each entry in the memory map table is as follows: + * + * 0 +----------------+ + * | addr | Base address + * 8 +----------------+ + * | size | Size of mapping in bytes + * 16 +----------------+ + * | type | Type of mapping as defined between the hypervisor + * | | and guest. See XEN_HVM_MEMMAP_TYPE_* values below. + * 20 +----------------| + * | reserved | + * 24 +----------------+ + * + * The address and sizes are always a 64bit little endian unsigned integer. + * + * NB: Xen on x86 will always try to place all the data below the 4GiB + * boundary. + * + * Version numbers of the hvm_start_info structure have evolved like this: + * + * Version 0: Initial implementation. + * + * Version 1: Added the memmap_paddr/memmap_entries fields (plus 4 bytes of + * padding) to the end of the hvm_start_info struct. These new + * fields can be used to pass a memory map to the guest. 
The + * memory map is optional and so guests that understand version 1 + * of the structure must check that memmap_entries is non-zero + * before trying to read the memory map. + */ + +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct hvm_start_info { + pub magic: u32, + pub version: u32, + pub flags: u32, + pub nr_modules: u32, + pub modlist_paddr: u64, + pub cmdline_paddr: u64, + pub rsdp_paddr: u64, + pub memmap_paddr: u64, + pub memmap_entries: u32, + pub reserved: u32, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct hvm_modlist_entry { + pub paddr: u64, + pub size: u64, + pub cmdline_paddr: u64, + pub reserved: u64, +} + +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct hvm_memmap_table_entry { + pub addr: u64, + pub size: u64, + pub type_: u32, + pub reserved: u32, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bindgen_test_layout_hvm_start_info() { + assert_eq!( + ::std::mem::size_of::(), + 56usize, + concat!("Size of: ", stringify!(hvm_start_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(hvm_start_info)) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).magic as *const _ as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(magic) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).version as *const _ as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(version) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).flags as *const _ as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(flags) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).nr_modules as *const _ as usize }, + 12usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(nr_modules) + ) + ); + assert_eq!( + unsafe { + 
&(*(::std::ptr::null::())).modlist_paddr as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(modlist_paddr) + ) + ); + assert_eq!( + unsafe { + &(*(::std::ptr::null::())).cmdline_paddr as *const _ as usize + }, + 24usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(cmdline_paddr) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).rsdp_paddr as *const _ as usize }, + 32usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(rsdp_paddr) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).memmap_paddr as *const _ as usize }, + 40usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(memmap_paddr) + ) + ); + assert_eq!( + unsafe { + &(*(::std::ptr::null::())).memmap_entries as *const _ as usize + }, + 48usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(memmap_entries) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).reserved as *const _ as usize }, + 52usize, + concat!( + "Offset of field: ", + stringify!(hvm_start_info), + "::", + stringify!(reserved) + ) + ); + } + + #[test] + fn bindgen_test_layout_hvm_modlist_entry() { + assert_eq!( + ::std::mem::size_of::(), + 32usize, + concat!("Size of: ", stringify!(hvm_modlist_entry)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(hvm_modlist_entry)) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).paddr as *const _ as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(hvm_modlist_entry), + "::", + stringify!(paddr) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).size as *const _ as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(hvm_modlist_entry), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { + &(*(::std::ptr::null::())).cmdline_paddr as *const _ as usize + 
}, + 16usize, + concat!( + "Offset of field: ", + stringify!(hvm_modlist_entry), + "::", + stringify!(cmdline_paddr) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).reserved as *const _ as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(hvm_modlist_entry), + "::", + stringify!(reserved) + ) + ); + } + + #[test] + fn bindgen_test_layout_hvm_memmap_table_entry() { + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(hvm_memmap_table_entry)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(hvm_memmap_table_entry)) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).addr as *const _ as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(hvm_memmap_table_entry), + "::", + stringify!(addr) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).size as *const _ as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(hvm_memmap_table_entry), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { + &(*(::std::ptr::null::())).type_ as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(hvm_memmap_table_entry), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { + &(*(::std::ptr::null::())).reserved as *const _ as usize + }, + 20usize, + concat!( + "Offset of field: ", + stringify!(hvm_memmap_table_entry), + "::", + stringify!(reserved) + ) + ); + } +} diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 8570ea996c4..39620b5fd04 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -15,3 +15,6 @@ polly = { path = "../polly" } seccomp = { path = "../seccomp" } utils = { path = "../utils" } vmm = { path = "../vmm" } + +[features] +pvh = ["vmm/pvh"] diff --git a/src/kernel/src/loader/elf.rs b/src/kernel/src/loader/elf.rs index cf12ce4d55d..e62d342e6d2 100644 --- a/src/kernel/src/loader/elf.rs +++ b/src/kernel/src/loader/elf.rs @@ -20,6 +20,7 @@ 
pub const ELFMAG0: ::std::os::raw::c_uint = 127; pub const ELFDATA2LSB: ::std::os::raw::c_uint = 1; pub const PT_LOAD: ::std::os::raw::c_uint = 1; +pub const PT_NOTE: ::std::os::raw::c_uint = 4; pub const ELFMAG1: u8 = b'E'; pub const ELFMAG2: u8 = b'L'; @@ -85,6 +86,15 @@ impl Clone for elf64_phdr { } pub type Elf64_Phdr = elf64_phdr; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct elf64_note { + pub n_namesz: Elf64_Word, + pub n_descsz: Elf64_Word, + pub n_type: Elf64_Word, +} +pub type Elf64_Nhdr = elf64_note; + #[cfg(test)] mod tests { use super::*; @@ -336,4 +346,48 @@ mod tests { ) ); } + + #[test] + fn bindgen_test_layout_elf64_note() { + assert_eq!( + ::std::mem::size_of::(), + 12usize, + concat!("Size of: ", stringify!(elf64_note)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(elf64_note)) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).n_namesz as *const _ as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(elf64_note), + "::", + stringify!(n_namesz) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).n_descsz as *const _ as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(elf64_note), + "::", + stringify!(n_descsz) + ) + ); + assert_eq!( + unsafe { &(*(::std::ptr::null::())).n_type as *const _ as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(elf64_note), + "::", + stringify!(n_type) + ) + ); + } } diff --git a/src/kernel/src/loader/mod.rs b/src/kernel/src/loader/mod.rs index 381665a0a61..4ae97d39a21 100644 --- a/src/kernel/src/loader/mod.rs +++ b/src/kernel/src/loader/mod.rs @@ -35,6 +35,9 @@ pub enum Error { SeekKernelStart, SeekKernelImage, SeekProgramHeader, + SeekNoteHeader, + ReadNoteHeader, + InvalidPvhNote, } impl fmt::Display for Error { @@ -56,6 +59,9 @@ impl fmt::Display for Error { } Error::SeekKernelImage => "Failed to seek to offset of kernel image", Error::SeekProgramHeader => "Failed to seek to ELF 
program header", + Error::SeekNoteHeader => "Unable to seek to note header", + Error::ReadNoteHeader => "Unable to read note header", + Error::InvalidPvhNote => "Invalid PVH note header", } ) } @@ -71,13 +77,14 @@ pub type Result = std::result::Result; /// * `kernel_image` - Input vmlinux image. /// * `start_address` - For x86_64, this is the start of the high memory. Kernel should reside above it. /// -/// Returns the entry address of the kernel. +/// Returns the default entry address of the kernel and an optional field with a PVH entry point address +/// if one exists. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn load_kernel( guest_mem: &GuestMemoryMmap, kernel_image: &mut F, start_address: u64, -) -> Result +) -> Result<(GuestAddress, Option)> where F: Read + Seek, { @@ -113,6 +120,10 @@ where return Err(Error::InvalidEntryAddress); } + // This field will optionally hold the address of a PVH entry point if + // the kernel binary supports the PVH boot protocol. + let mut pvh_entry_pt = None; + kernel_image .seek(SeekFrom::Start(ehdr.e_phoff)) .map_err(|_| Error::SeekProgramHeader)?; @@ -125,6 +136,10 @@ where // Read in each section pointed to by the program headers. for phdr in &phdrs { if (phdr.p_type & elf::PT_LOAD) == 0 || phdr.p_filesz == 0 { + if phdr.p_type == elf::PT_NOTE { + // This segment describes a Note, check if PVH entry point is encoded. + pvh_entry_pt = parse_elf_note(phdr, kernel_image)?; + } continue; } @@ -142,7 +157,96 @@ where .map_err(|_| Error::ReadKernelImage)?; } - Ok(GuestAddress(ehdr.e_entry)) + Ok((GuestAddress(ehdr.e_entry), pvh_entry_pt)) +} + +/// Examines a supplied ELF program header of type `PT_NOTE` to determine if it contains an entry +/// of name `Xen` and type `XEN_ELFNOTE_PHYS32_ENTRY` (0x12). Notes of this type encode a physical +/// 32-bit entry point address into the kernel, which is used when launching guests in 32-bit +/// (protected) mode with paging disabled, as described by the PVH boot protocol. 
+/// +/// Returns the encoded entry point address, or `None` if no `XEN_ELFNOTE_PHYS32_ENTRY` entries are +/// found in the note header. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn parse_elf_note(phdr: &elf::Elf64_Phdr, kernel_image: &mut F) -> Result> +where + F: Read + Seek, +{ + // Type of note header that encodes a 32-bit entry point address + // to boot a guest kernel using the PVH boot protocol. + const XEN_ELFNOTE_PHYS32_ENTRY: u32 = 18; + + // Size of string "Xen", including the terminating NULL byte. + const PVH_NOTE_STR_SZ: usize = 4; + + let n_align = phdr.p_align; + + // Seek to the beginning of the note segment + kernel_image + .seek(SeekFrom::Start(phdr.p_offset)) + .map_err(|_| Error::SeekNoteHeader)?; + + // Now that the segment has been found, we must locate an ELF note with the + // correct type that encodes the PVH entry point if there is one. + let mut nhdr: elf::Elf64_Nhdr = Default::default(); + let mut read_size: usize = 0; + + while read_size < phdr.p_filesz as usize { + unsafe { + // read_struct is safe when reading a POD struct. + // It can be used and dropped without issue. + utils::structs::read_struct(kernel_image, &mut nhdr) + .map_err(|_| Error::ReadNoteHeader)?; + } + // Check if the note header's name and type match the ones specified by the PVH ABI. 
+ if nhdr.n_type == XEN_ELFNOTE_PHYS32_ENTRY && nhdr.n_namesz as usize == PVH_NOTE_STR_SZ { + let mut buf = [0u8; PVH_NOTE_STR_SZ]; + kernel_image + .read_exact(&mut buf) + .map_err(|_| Error::ReadNoteHeader)?; + if buf == [b'X', b'e', b'n', b'\0'] { + break; + } + } + + // Skip the note header plus the size of its fields (with alignment) + read_size += mem::size_of::() + + align_up(u64::from(nhdr.n_namesz), n_align) + + align_up(u64::from(nhdr.n_descsz), n_align); + + kernel_image + .seek(SeekFrom::Start(phdr.p_offset + read_size as u64)) + .map_err(|_| Error::SeekNoteHeader)?; + } + + if read_size >= phdr.p_filesz as usize { + return Ok(None); // PVH ELF note not found, nothing else to do. + } + // Otherwise the correct note type was found. + // The note header struct has already been read, so we can seek from the + // current position and just skip the name field contents. + kernel_image + .seek(SeekFrom::Current( + align_up(u64::from(nhdr.n_namesz), n_align) as i64 - PVH_NOTE_STR_SZ as i64, + )) + .map_err(|_| Error::SeekNoteHeader)?; + + // The PVH entry point is a 32-bit address, so the descriptor field + // must be capable of storing all such addresses. + if (nhdr.n_descsz as usize) < mem::size_of::() { + return Err(Error::InvalidPvhNote); + } + + let mut pvh_addr_bytes = [0; mem::size_of::()]; + + // Read 32-bit address stored in the PVH note descriptor field. + kernel_image + .read_exact(&mut pvh_addr_bytes) + .map_err(|_| Error::ReadNoteHeader)?; + + Ok(Some(GuestAddress( + u32::from_le_bytes(pvh_addr_bytes).into(), + ))) } #[cfg(target_arch = "aarch64")] @@ -150,7 +254,7 @@ pub fn load_kernel( guest_mem: &GuestMemoryMmap, kernel_image: &mut F, start_address: u64, -) -> Result +) -> Result<(GuestAddress, Option)> where F: Read + Seek, { @@ -224,7 +328,7 @@ where ) .map_err(|_| Error::ReadKernelImage)?; - Ok(GuestAddress(kernel_load_offset)) + Ok((GuestAddress(kernel_load_offset), None)) } /// Writes the command line string to the given memory slice. 
@@ -259,6 +363,22 @@ pub fn load_cmdline( Ok(()) } +/// Align address upwards. Taken from x86_64 crate: +/// https://docs.rs/x86_64/latest/x86_64/fn.align_up.html +/// +/// Returns the smallest x with alignment `align` so that x >= addr. The alignment must be +/// a power of 2. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn align_up(addr: u64, align: u64) -> usize { + assert!(align.is_power_of_two(), "`align` must be a power of two"); + let align_mask = align - 1; + if addr & align_mask == 0 { + addr as usize // already aligned + } else { + ((addr | align_mask) + 1) as usize + } +} + #[cfg(test)] mod tests { use super::super::cmdline::Cmdline; @@ -266,7 +386,7 @@ mod tests { use std::io::Cursor; use vm_memory::{GuestAddress, GuestMemoryMmap}; - const MEM_SIZE: usize = 0x18_0000; + const MEM_SIZE: usize = 0x48_0000; fn create_guest_mem() -> GuestMemoryMmap { GuestMemoryMmap::from_ranges(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap() @@ -282,6 +402,21 @@ mod tests { include_bytes!("test_pe.bin").to_vec() } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn make_elfnote_bin() -> Vec { + include_bytes!("test_elfnote.bin").to_vec() + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn make_dummy_elfnote_bin() -> Vec { + include_bytes!("test_dummynote.bin").to_vec() + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn make_bad_elfnote_bin() -> Vec { + include_bytes!("test_badnote.bin").to_vec() + } + #[test] // Tests that loading the kernel is successful on different archs. 
fn test_load_kernel() { @@ -291,10 +426,11 @@ mod tests { let load_addr = 0x10_0000; #[cfg(target_arch = "aarch64")] let load_addr = 0x8_0000; - assert_eq!( - Ok(GuestAddress(load_addr)), - load_kernel(&gm, &mut Cursor::new(&image), 0) - ); + + let (entry_addr, pvh_addr) = load_kernel(&gm, &mut Cursor::new(&image), 0).unwrap(); + + assert!(pvh_addr.is_none()); + assert_eq!(GuestAddress(load_addr), entry_addr); } #[test] @@ -385,6 +521,38 @@ mod tests { ); } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + fn test_load_pvh() { + let gm = create_guest_mem(); + let pvhnote_image = make_elfnote_bin(); + let (_, pvh_addr) = load_kernel(&gm, &mut Cursor::new(&pvhnote_image), 0).unwrap(); + + assert_eq!(pvh_addr.unwrap(), GuestAddress(0x1_e1f_e1f)); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + fn test_dummy_elfnote() { + let gm = create_guest_mem(); + let dummynote_image = make_dummy_elfnote_bin(); + let (entry_addr, pvh_addr) = + load_kernel(&gm, &mut Cursor::new(&dummynote_image), 0).unwrap(); + + assert!(pvh_addr.is_none()); + assert_eq!(entry_addr, GuestAddress(0x40_00f0)); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + fn test_bad_elfnote() { + let gm = create_guest_mem(); + let badnote_image = make_bad_elfnote_bin(); + assert_eq!( + Err(Error::InvalidPvhNote), + load_kernel(&gm, &mut Cursor::new(&badnote_image), 0) + ); + } #[test] fn test_cmdline_overflow() { let gm = create_guest_mem(); diff --git a/src/kernel/src/loader/test_badnote.bin b/src/kernel/src/loader/test_badnote.bin new file mode 100644 index 00000000000..99013dd50fe Binary files /dev/null and b/src/kernel/src/loader/test_badnote.bin differ diff --git a/src/kernel/src/loader/test_dummynote.bin b/src/kernel/src/loader/test_dummynote.bin new file mode 100644 index 00000000000..990e69ed925 Binary files /dev/null and b/src/kernel/src/loader/test_dummynote.bin differ diff --git a/src/kernel/src/loader/test_elfnote.bin 
b/src/kernel/src/loader/test_elfnote.bin new file mode 100644 index 00000000000..e2fc7faa4f1 Binary files /dev/null and b/src/kernel/src/loader/test_elfnote.bin differ diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index f63f2490820..702796b2164 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -29,3 +29,6 @@ cpuid = { path = "../cpuid" } [dev-dependencies] vmm-sys-util = ">=0.4.0" + +[features] +pvh = [] diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5fe058aebd0..1cf4cee7f22 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -11,6 +11,7 @@ use std::sync::{Arc, Mutex}; use super::{Error, Vmm}; use arch::InitrdConfig; +use arch::{BootProtocol, EntryPoint}; #[cfg(target_arch = "x86_64")] use device_manager::legacy::PortIODeviceManager; use device_manager::mmio::MMIODeviceManager; @@ -234,7 +235,7 @@ pub fn build_microvm( .ok_or(StartMicrovmError::MissingMemSizeConfig)?, )?; let vcpu_config = vm_resources.vcpu_config(); - let entry_addr = load_kernel(boot_config, &guest_memory)?; + let entry_point = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; // Clone the command-line so that a failed boot doesn't pollute the original. #[allow(unused_mut)] @@ -293,7 +294,7 @@ pub fn build_microvm( &vm, &vcpu_config, &guest_memory, - entry_addr, + entry_point, request_ts, &pio_device_manager.io_bus, &exit_evt, @@ -311,7 +312,7 @@ pub fn build_microvm( &vm, &vcpu_config, &guest_memory, - entry_addr, + entry_point.entry_addr, request_ts, &exit_evt, ) @@ -349,7 +350,7 @@ pub fn build_microvm( #[cfg(target_arch = "x86_64")] load_cmdline(&vmm)?; - vmm.configure_system(vcpus.as_slice(), &initrd) + vmm.configure_system(vcpus.as_slice(), &initrd, entry_point.protocol) .map_err(StartMicrovmError::Internal)?; // Firecracker uses the same seccomp filter for all threads. 
vmm.start_vcpus(vcpus, seccomp_filter.to_vec(), seccomp_filter) @@ -377,17 +378,31 @@ pub fn create_guest_memory( fn load_kernel( boot_config: &BootConfig, guest_memory: &GuestMemoryMmap, -) -> std::result::Result { +) -> std::result::Result { let mut kernel_file = boot_config .kernel_file .try_clone() .map_err(|e| StartMicrovmError::Internal(Error::KernelFile(e)))?; - let entry_addr = + let (entry_addr, pvh_entry_pt) = kernel::loader::load_kernel(guest_memory, &mut kernel_file, arch::get_kernel_start()) .map_err(StartMicrovmError::KernelLoader)?; - Ok(entry_addr) + let mut entry_point_addr: GuestAddress = entry_addr; + let mut boot_prot: BootProtocol = BootProtocol::LinuxBoot; + + if cfg!(feature = "pvh") && cfg!(target_arch = "x86_64") { + if let Some(pvh_entry_addr) = pvh_entry_pt { + // Use the PVH kernel entry point to boot the guest + entry_point_addr = pvh_entry_addr; + boot_prot = BootProtocol::PvhBoot; + } + } + + Ok(EntryPoint { + entry_addr: entry_point_addr, + protocol: boot_prot, + }) } fn load_initrd_from_config( @@ -572,7 +587,7 @@ fn create_vcpus_x86_64( vm: &Vm, vcpu_config: &VcpuConfig, guest_mem: &GuestMemoryMmap, - entry_addr: GuestAddress, + entry_point: EntryPoint, request_ts: TimestampUs, io_bus: &devices::Bus, exit_evt: &EventFd, @@ -590,7 +605,7 @@ fn create_vcpus_x86_64( ) .map_err(Error::Vcpu)?; - vcpu.configure_x86_64(guest_mem, entry_addr, vcpu_config) + vcpu.configure_x86_64(guest_mem, entry_point, vcpu_config) .map_err(Error::Vcpu)?; vcpus.push(vcpu); @@ -946,13 +961,16 @@ pub mod tests { }; // Dummy entry_addr, vcpus will not boot. 
- let entry_addr = GuestAddress(0); + let entry_point = EntryPoint { + entry_addr: GuestAddress(0), + protocol: BootProtocol::LinuxBoot, + }; let bus = devices::Bus::new(); let vcpu_vec = create_vcpus_x86_64( &vm, &vcpu_config, &guest_memory, - entry_addr, + entry_point, TimestampUs::default(), &bus, &EventFd::new(libc::EFD_NONBLOCK).unwrap(), diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 6b3e16353e6..8d729abc9fa 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -54,7 +54,7 @@ use std::sync::Mutex; use std::time::Duration; use arch::DeviceType; -use arch::InitrdConfig; +use arch::{BootProtocol, InitrdConfig}; #[cfg(target_arch = "x86_64")] use device_manager::legacy::PortIODeviceManager; use device_manager::mmio::MMIODeviceManager; @@ -284,7 +284,12 @@ impl Vmm { } /// Configures the system for boot. - pub fn configure_system(&self, vcpus: &[Vcpu], initrd: &Option) -> Result<()> { + pub fn configure_system( + &self, + vcpus: &[Vcpu], + initrd: &Option, + _boot_prot: BootProtocol, + ) -> Result<()> { #[cfg(target_arch = "x86_64")] arch::x86_64::configure_system( &self.guest_memory, @@ -292,6 +297,7 @@ impl Vmm { self.kernel_cmdline.len() + 1, initrd, vcpus.len() as u8, + _boot_prot, ) .map_err(Error::ConfigureSystem)?; diff --git a/src/vmm/src/vstate.rs b/src/vmm/src/vstate.rs index 4cfa071e77c..0fe8b6c4cdb 100644 --- a/src/vmm/src/vstate.rs +++ b/src/vmm/src/vstate.rs @@ -23,6 +23,8 @@ use arch; #[cfg(target_arch = "aarch64")] use arch::aarch64::gic::GICDevice; #[cfg(target_arch = "x86_64")] +use arch::EntryPoint; +#[cfg(target_arch = "x86_64")] use cpuid::{c3, filter_cpuid, t2, VmSpec}; #[cfg(target_arch = "x86_64")] use kvm_bindings::{ @@ -38,9 +40,9 @@ use seccomp::{BpfProgram, SeccompFilter}; use utils::eventfd::EventFd; use utils::signal::{register_signal_handler, sigrtmin, Killable}; use utils::sm::StateMachine; -use vm_memory::{ - Address, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion, -}; 
+#[cfg(target_arch = "aarch64")] +use vm_memory::GuestAddress; +use vm_memory::{Address, GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion}; use vmm_config::machine_config::CpuFeaturesTemplate; #[cfg(target_arch = "x86_64")] @@ -794,7 +796,7 @@ impl Vcpu { pub fn configure_x86_64( &mut self, guest_mem: &GuestMemoryMmap, - kernel_start_addr: GuestAddress, + kernel_entry_point: EntryPoint, vcpu_config: &VcpuConfig, ) -> Result<()> { let cpuid_vm_spec = VmSpec::new(self.id, vcpu_config.vcpu_count, vcpu_config.ht_enabled) @@ -822,10 +824,11 @@ impl Vcpu { .map_err(Error::VcpuSetCpuid)?; arch::x86_64::msr::setup_msrs(&self.fd).map_err(Error::MSRSConfiguration)?; - arch::x86_64::regs::setup_regs(&self.fd, kernel_start_addr.raw_value() as u64) + arch::x86_64::regs::setup_regs(&self.fd, kernel_entry_point) .map_err(Error::REGSConfiguration)?; arch::x86_64::regs::setup_fpu(&self.fd).map_err(Error::FPUConfiguration)?; - arch::x86_64::regs::setup_sregs(guest_mem, &self.fd).map_err(Error::SREGSConfiguration)?; + arch::x86_64::regs::setup_sregs(guest_mem, &self.fd, kernel_entry_point.protocol) + .map_err(Error::SREGSConfiguration)?; arch::x86_64::interrupts::set_lint(&self.fd).map_err(Error::LocalIntConfiguration)?; Ok(()) } @@ -1343,6 +1346,9 @@ mod tests { use super::super::devices; use super::*; + use arch::BootProtocol; + + use vm_memory::GuestAddress; use utils::signal::validate_signal_num; @@ -1490,20 +1496,24 @@ mod tests { cpu_template: None, }; + let entry_point = EntryPoint { + entry_addr: GuestAddress(0), + protocol: BootProtocol::LinuxBoot, + }; assert!(vcpu - .configure_x86_64(&vm_mem, GuestAddress(0), &vcpu_config) + .configure_x86_64(&vm_mem, entry_point, &vcpu_config) .is_ok()); // Test configure while using the T2 template. 
vcpu_config.cpu_template = Some(CpuFeaturesTemplate::T2); assert!(vcpu - .configure_x86_64(&vm_mem, GuestAddress(0), &vcpu_config) + .configure_x86_64(&vm_mem, entry_point, &vcpu_config) .is_ok()); // Test configure while using the C3 template. vcpu_config.cpu_template = Some(CpuFeaturesTemplate::C3); assert!(vcpu - .configure_x86_64(&vm_mem, GuestAddress(0), &vcpu_config) + .configure_x86_64(&vm_mem, entry_point, &vcpu_config) .is_ok()); } @@ -1642,7 +1652,7 @@ mod tests { } #[cfg(target_arch = "x86_64")] - fn load_good_kernel(vm_memory: &GuestMemoryMmap) -> GuestAddress { + fn load_good_kernel(vm_memory: &GuestMemoryMmap) -> EntryPoint { use vmm_config::boot_source::DEFAULT_KERNEL_CMDLINE; let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -1657,7 +1667,7 @@ mod tests { assert!(cmdline.insert_str(DEFAULT_KERNEL_CMDLINE).is_ok()); let cmdline_addr = GuestAddress(arch::x86_64::layout::CMDLINE_START); - let entry_addr = kernel::loader::load_kernel( + let (entry_addr, _pvh_entry_pt) = kernel::loader::load_kernel( vm_memory, &mut kernel_file, arch::x86_64::layout::HIMEM_START, @@ -1671,7 +1681,10 @@ mod tests { ) .expect("failed to load cmdline"); - entry_addr + EntryPoint { + entry_addr, + protocol: BootProtocol::LinuxBoot, + } } #[cfg(target_arch = "x86_64")] @@ -1714,14 +1727,14 @@ mod tests { let vcpu_exit_evt = vcpu.exit_evt.try_clone().unwrap(); // Needs a kernel since we'll actually run this vcpu. 
- let entry_addr = load_good_kernel(&vm_mem); + let entry_point = load_good_kernel(&vm_mem); let vcpu_config = VcpuConfig { vcpu_count: 1, ht_enabled: false, cpu_template: None, }; - vcpu.configure_x86_64(&vm_mem, entry_addr, &vcpu_config) + vcpu.configure_x86_64(&vm_mem, entry_point, &vcpu_config) .expect("failed to configure vcpu"); let seccomp_filter = seccomp::SeccompFilter::empty().try_into().unwrap(); diff --git a/tools/devtool b/tools/devtool index 0b872b72ebc..4c65c177a02 100755 --- a/tools/devtool +++ b/tools/devtool @@ -475,6 +475,28 @@ cmd_build() { [ $profile = "release" ] && cargo_args+=("--release") + # Parse any optional features passed to cargo. + # Currently, cargo does not support specifying features in a workspace, so passing + # the "--features opt-feature" option to cargo does not enable the optional feature + # in the crates that are part of the workspace, as one would expect. In order to + # build the individual crates with non-default features, we can use the cargo build + # option "--manifest-path", to direct cargo to use the Cargo.toml file that defines + # the desired feature. The downside is that other independent crates in the workspace + # (i.e. jailer) will not be built. + while [ $# -gt 0 ]; do + case "$1" in + "--features") + shift + [[ "$1" =~ ^--* ]] && \ + die "Must specify a feature name: \"--features \"." + say_warn "WARNING: Building Firecracker with experimental feature." \ + "The jailer binary will not be built." + cargo_args+=("--manifest-path" "src/firecracker/Cargo.toml") + ;; + esac + shift + done + # Run the cargo build process inside the container. # We don't need any special privileges for the build phase, so we run the # container as the current user/group.