From f98af8ad54f877689cdf7e1bffcf8fd0329cac7c Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Thu, 12 Sep 2024 14:27:21 +0200
Subject: [PATCH] virtio: allow IoVecBufferMut to hold multiple DescriptorChain objects

Allow IoVecBufferMut objects to store multiple DescriptorChain objects,
so that we can describe guest memory meant to be used for receiving
data (for example memory used for network RX) as a single (sparse)
memory region.

This will allow us to always keep track of all the available memory we
have for performing RX and to use `readv` for copying data from the TAP
device directly into guest memory, avoiding the extra copy. In the
future, it will also facilitate the implementation of mergeable buffers
for the RX path of the network device.

Signed-off-by: Babis Chalios
---
 src/vmm/src/devices/virtio/iovec.rs        | 163 ++++++++++++++++-----
 src/vmm/src/devices/virtio/rng/device.rs   |   9 +-
 src/vmm/src/devices/virtio/vsock/mod.rs    |   4 +
 src/vmm/src/devices/virtio/vsock/packet.rs |  10 +-
 4 files changed, 143 insertions(+), 43 deletions(-)

diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs
index 688f27a0a67..7de07a1ab1b 100644
--- a/src/vmm/src/devices/virtio/iovec.rs
+++ b/src/vmm/src/devices/virtio/iovec.rs
@@ -10,6 +10,7 @@ use vm_memory::{
     GuestMemory, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile,
 };
 
+use super::iov_deque::{IovDeque, IovDequeError};
 use crate::devices::virtio::queue::DescriptorChain;
 use crate::vstate::memory::GuestMemoryMmap;
 
@@ -23,6 +24,8 @@ pub enum IoVecError {
     OverflowedDescriptor,
     /// Guest memory error: {0}
     GuestMemory(#[from] GuestMemoryError),
+    /// Error with underlying `IovDeque`: {0}
+    IovDeque(#[from] IovDequeError),
 }
 
 // Using SmallVec in the kani proofs causes kani to use unbounded amounts of memory
@@ -219,28 +222,24 @@ impl IoVecBuffer {
 /// It describes a write-only buffer passed to us by the guest that is scattered across multiple
 /// memory regions. Additionally, this wrapper provides methods that allow reading arbitrary ranges
 /// of data from that buffer.
-#[derive(Debug, Default, Clone)]
-pub struct IoVecBufferMut {
+#[derive(Debug)]
+pub struct IoVecBufferMut<'a> {
     // container of the memory regions included in this IO vector
-    vecs: IoVecVec,
+    vecs: IovDeque<'a>,
     // Total length of the IoVecBufferMut
-    len: u32,
+    len: usize,
 }
 
-impl IoVecBufferMut {
-    /// Create an `IoVecBuffer` from a `DescriptorChain`
-    ///
-    /// # Safety
-    ///
-    /// The descriptor chain cannot be referencing the same memory location as another chain
-    pub unsafe fn load_descriptor_chain(
+impl<'a> IoVecBufferMut<'a> {
+    /// Parse a `DescriptorChain` object and append the memory regions it describes to the
+    /// underlying ring buffer.
+    fn parse_descriptor(
         &mut self,
         mem: &GuestMemoryMmap,
         head: DescriptorChain,
-    ) -> Result<(), IoVecError> {
-        self.clear();
-
+    ) -> Result<u32, IoVecError> {
         let mut next_descriptor = Some(head);
+        let mut length = 0u32;
         while let Some(desc) = next_descriptor {
             if !desc.is_write_only() {
                 return Err(IoVecError::ReadOnlyDescriptor);
             }
@@ -257,18 +256,80 @@ impl IoVecBufferMut {
             slice.bitmap().mark_dirty(0, desc.len as usize);
 
             let iov_base = slice.ptr_guard_mut().as_ptr().cast::<c_void>();
-            self.vecs.push(iovec {
-                iov_base,
-                iov_len: desc.len as size_t,
-            });
-            self.len = self
-                .len
+            self.vecs
+                .push_back(iovec {
+                    iov_base,
+                    iov_len: desc.len as size_t,
+                })
+                .unwrap();
+            length = length
                 .checked_add(desc.len)
                 .ok_or(IoVecError::OverflowedDescriptor)?;
 
             next_descriptor = desc.next_descriptor();
         }
 
+        self.len = self
+            .len
+            .checked_add(length as usize)
+            .ok_or(IoVecError::OverflowedDescriptor)?;
+
+        Ok(length)
+    }
+
+    /// Create an empty `IoVecBufferMut`.
+    pub(crate) fn new() -> Result<Self, IoVecError> {
+        let vecs = IovDeque::new()?;
+        Ok(Self { vecs, len: 0 })
+    }
+
+    /// Create an `IoVecBufferMut` from a `DescriptorChain`
+    ///
+    /// This will clear any previous `iovec` objects in the buffer and load the new
+    /// [`DescriptorChain`].
+    ///
+    /// # Safety
+    ///
+    /// The descriptor chain cannot be referencing the same memory location as another chain
+    pub unsafe fn load_descriptor_chain(
+        &mut self,
+        mem: &GuestMemoryMmap,
+        head: DescriptorChain,
+    ) -> Result<(), IoVecError> {
+        self.clear();
+        let _ = self.parse_descriptor(mem, head)?;
+        Ok(())
+    }
+
+    /// Append a `DescriptorChain` to this `IoVecBufferMut`
+    ///
+    /// # Safety
+    ///
+    /// The descriptor chain cannot be referencing the same memory location as another chain
+    pub unsafe fn append_descriptor_chain(
+        &mut self,
+        mem: &GuestMemoryMmap,
+        head: DescriptorChain,
+    ) -> Result<u32, IoVecError> {
+        self.parse_descriptor(mem, head)
+    }
+
+    /// Drop memory from the `IoVecBufferMut`
+    ///
+    /// This will drop memory described by the `IoVecBufferMut` starting from the beginning.
+    pub fn drop_iovecs(&mut self, size: u32) -> Result<(), IoVecError> {
+        let dropped = self.vecs.drop_iovs(size as usize);
+
+        // Users should only ask us to drop a `size` of memory that is exactly covered by `iovec`
+        // objects. In other words, the sum of the lengths of all dropped `iovec` objects should be
+        // equal to the `size` we were asked to drop. If it isn't, something is seriously wrong
+        // with the VirtIO queue or the emulation logic, so fail at this point.
+        assert_eq!(u32::try_from(dropped).unwrap(), size);
+        self.len = self
+            .len
+            .checked_sub(size as usize)
+            .ok_or(IoVecError::OverflowedDescriptor)?;
+
         Ok(())
     }
 
@@ -281,20 +342,34 @@ impl IoVecBufferMut {
         mem: &GuestMemoryMmap,
         head: DescriptorChain,
     ) -> Result<Self, IoVecError> {
-        let mut new_buffer = Self::default();
+        let mut new_buffer = Self::new()?;
         new_buffer.load_descriptor_chain(mem, head)?;
         Ok(new_buffer)
     }
 
     /// Get the total length of the memory regions covered by this `IoVecBuffer`
-    pub(crate) fn len(&self) -> u32 {
+    ///
+    /// In contrast to the equivalent [`IoVecBuffer::len()`], which returns `u32`, this returns
+    /// `usize` since the buffer can hold multiple `DescriptorChain` objects, so its total
+    /// length is not limited to what fits in a `u32`.
+    pub(crate) fn len(&self) -> usize {
         self.len
     }
 
+    /// Returns a pointer to the memory keeping the `iovec` structs
+    pub fn as_iovec_ptr(&mut self) -> *mut iovec {
+        self.vecs.as_mut_slice().as_mut_ptr()
+    }
+
+    /// Returns the length of the `iovec` array.
+    pub fn iovec_count(&self) -> usize {
+        self.vecs.len()
+    }
+
     /// Clears the `iovec` array
     pub fn clear(&mut self) {
         self.vecs.clear();
-        self.len = 0u32;
+        self.len = 0;
     }
 
     /// Writes a number of bytes into the `IoVecBufferMut` starting at a given offset.
@@ -313,7 +388,7 @@ impl IoVecBufferMut {
         mut buf: &[u8],
         offset: usize,
     ) -> Result<(), VolatileMemoryError> {
-        if offset < self.len() as usize {
+        if offset < self.len() {
             let expected = buf.len();
             let bytes_written = self.write_volatile_at(&mut buf, offset, expected)?;
 
@@ -342,7 +417,7 @@ impl IoVecBufferMut {
     ) -> Result<usize, VolatileMemoryError> {
         let mut total_bytes_read = 0;
 
-        for iov in &self.vecs {
+        for iov in self.vecs.as_mut_slice() {
             if len == 0 {
                 break;
             }
@@ -391,6 +466,7 @@ mod tests {
     use vm_memory::VolatileMemoryError;
 
     use super::{IoVecBuffer, IoVecBufferMut};
+    use crate::devices::virtio::iov_deque::IovDeque;
    use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE};
     use crate::devices::virtio::test_utils::VirtQueue;
     use crate::utilities::test_utils::multi_region_mem;
@@ -427,15 +503,18 @@ mod tests {
         }
     }
 
-    impl From<&mut [u8]> for IoVecBufferMut {
+    impl<'a> From<&mut [u8]> for IoVecBufferMut<'a> {
         fn from(buf: &mut [u8]) -> Self {
+            let mut vecs = IovDeque::new().unwrap();
+            vecs.push_back(iovec {
+                iov_base: buf.as_mut_ptr().cast::<c_void>(),
+                iov_len: buf.len(),
+            })
+            .unwrap();
+
             Self {
-                vecs: vec![iovec {
-                    iov_base: buf.as_mut_ptr().cast::<c_void>(),
-                    iov_len: buf.len(),
-                }]
-                .into(),
-                len: buf.len().try_into().unwrap(),
+                vecs,
+                len: buf.len(),
             }
         }
     }
@@ -528,8 +607,19 @@ mod tests {
         let head = q.pop().unwrap();
 
         // SAFETY: This descriptor chain is only loaded once in this test
-        let iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() };
+        let mut iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() };
         assert_eq!(iovec.len(), 4 * 64);
+
+        // We are creating a new queue where we can get descriptors from. Probably, this is not
+        // something that we will ever want to do, as `IoVecBufferMut`s are typically
+        // (conceptually) associated with a single `Queue`. We just do this here to be able to test
+        // the appending logic.
+        let (mut q, _) = write_only_chain(&mem);
+        let head = q.pop().unwrap();
+        // SAFETY: it is actually unsafe, but we just want to check the length of the
+        // `IoVecBufferMut` after appending.
+        let _ = unsafe { iovec.append_descriptor_chain(&mem, head).unwrap() };
+        assert_eq!(iovec.len(), 8 * 64);
     }
 
     #[test]
@@ -728,7 +818,7 @@ mod verification {
         }
     }
 
-    impl IoVecBufferMut {
+    impl IoVecBufferMut<'_> {
         fn any_of_length(nr_descs: usize) -> Self {
             // We only write into `IoVecBufferMut` objects, so we can simply create a guest memory
             // object initialized to zeroes, trying to be nice to Kani.
@@ -740,7 +830,10 @@ mod verification {
             };
 
             let (vecs, len) = create_iovecs(mem, GUEST_MEMORY_SIZE, nr_descs);
-            Self { vecs, len }
+            Self {
+                vecs,
+                len: len.try_into().unwrap(),
+            }
         }
     }
 
diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs
index 43742ab0327..97cea458b97 100644
--- a/src/vmm/src/devices/virtio/rng/device.rs
+++ b/src/vmm/src/devices/virtio/rng/device.rs
@@ -112,7 +112,7 @@ impl Entropy {
             return Ok(0);
         }
 
-        let mut rand_bytes = vec![0; iovec.len() as usize];
+        let mut rand_bytes = vec![0; iovec.len()];
         rand::fill(&mut rand_bytes).map_err(|err| {
             METRICS.host_rng_fails.inc();
             err
@@ -120,7 +120,7 @@ impl Entropy {
         // It is ok to unwrap here. We are writing `iovec.len()` bytes at offset 0.
         iovec.write_all_volatile_at(&rand_bytes, 0).unwrap();
 
-        Ok(iovec.len())
+        Ok(u32::try_from(iovec.len()).unwrap())
     }
 
     fn process_entropy_queue(&mut self) {
@@ -145,7 +145,10 @@ impl Entropy {
             // Check for available rate limiting budget.
             // If not enough budget is available, leave the request descriptor in the queue
             // to handle once we do have budget.
-            if !Self::rate_limit_request(&mut self.rate_limiter, u64::from(iovec.len())) {
+            if !Self::rate_limit_request(
+                &mut self.rate_limiter,
+                u64::try_from(iovec.len()).unwrap(),
+            ) {
                 debug!("entropy: throttling entropy queue");
                 METRICS.entropy_rate_limiter_throttled.inc();
                 self.queues[RNG_QUEUE].undo_pop();
diff --git a/src/vmm/src/devices/virtio/vsock/mod.rs b/src/vmm/src/devices/virtio/vsock/mod.rs
index a87b6b488d8..383145054c6 100644
--- a/src/vmm/src/devices/virtio/vsock/mod.rs
+++ b/src/vmm/src/devices/virtio/vsock/mod.rs
@@ -30,6 +30,7 @@ pub use self::defs::uapi::VIRTIO_ID_VSOCK as TYPE_VSOCK;
 pub use self::defs::VSOCK_DEV_ID;
 pub use self::device::Vsock;
 pub use self::unix::{VsockUnixBackend, VsockUnixBackendError};
+use super::iov_deque::IovDequeError;
 use crate::devices::virtio::iovec::IoVecError;
 use crate::devices::virtio::persist::PersistError as VirtioStateError;
 
@@ -138,6 +139,8 @@ pub enum VsockError {
     VirtioState(VirtioStateError),
     /// Vsock uds backend error: {0}
     VsockUdsBackend(VsockUnixBackendError),
+    /// Underlying IovDeque error: {0}
+    IovDeque(IovDequeError),
 }
 
 impl From<IoVecError> for VsockError {
@@ -147,6 +150,7 @@ impl From<IoVecError> for VsockError {
             IoVecError::ReadOnlyDescriptor => VsockError::UnwritableDescriptor,
             IoVecError::GuestMemory(err) => VsockError::GuestMemoryMmap(err),
             IoVecError::OverflowedDescriptor => VsockError::DescChainOverflow,
+            IoVecError::IovDeque(err) => VsockError::IovDeque(err),
         }
     }
 }
diff --git a/src/vmm/src/devices/virtio/vsock/packet.rs b/src/vmm/src/devices/virtio/vsock/packet.rs
index 4c7e68ccb54..298d9df5a55 100644
--- a/src/vmm/src/devices/virtio/vsock/packet.rs
+++ b/src/vmm/src/devices/virtio/vsock/packet.rs
@@ -92,7 +92,7 @@ pub enum VsockPacketBuffer {
     /// Buffer holds a read-only guest-to-host (TX) packet
     Tx(IoVecBuffer),
     /// Buffer holds a write-only host-to-guest (RX) packet
-    Rx(IoVecBufferMut),
+    Rx(IoVecBufferMut<'static>),
 }
 
 /// Struct describing a single vsock packet.
@@ -172,8 +172,8 @@ impl VsockPacket {
         // are live at the same time, meaning this has exclusive ownership over the memory
         let buffer = unsafe { IoVecBufferMut::from_descriptor_chain(mem, chain)? };
 
-        if buffer.len() < VSOCK_PKT_HDR_SIZE {
-            return Err(VsockError::DescChainTooShortForHeader(buffer.len() as usize));
+        if (u32::try_from(buffer.len()).unwrap()) < VSOCK_PKT_HDR_SIZE {
+            return Err(VsockError::DescChainTooShortForHeader(buffer.len()));
         }
 
         Ok(Self {
@@ -222,7 +222,7 @@ impl VsockPacket {
     pub fn buf_size(&self) -> usize {
         let chain_length = match self.buffer {
             VsockPacketBuffer::Tx(ref iovec_buf) => iovec_buf.len(),
-            VsockPacketBuffer::Rx(ref iovec_buf) => iovec_buf.len(),
+            VsockPacketBuffer::Rx(ref iovec_buf) => iovec_buf.len().try_into().unwrap(),
         };
         (chain_length - VSOCK_PKT_HDR_SIZE) as usize
     }
@@ -237,7 +237,7 @@ impl VsockPacket {
             VsockPacketBuffer::Tx(_) => Err(VsockError::UnwritableDescriptor),
             VsockPacketBuffer::Rx(ref mut buffer) => {
                 if count
-                    > (buffer.len() as usize)
+                    > (buffer.len())
                         .saturating_sub(VSOCK_PKT_HDR_SIZE as usize)
                         .saturating_sub(offset)
                 {
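The sketch below illustrates how the `append_descriptor_chain`, `as_iovec_ptr`, `iovec_count` and `drop_iovecs` methods introduced by this patch are meant to be combined on an RX path. It is not part of the patch: the `rx_readv_sketch` helper, the `tap_fd` descriptor and the surrounding device plumbing are assumptions made for illustration, and it further assumes the amount read is exactly covered by whole descriptor chains, which is what `drop_iovecs` requires. A real implementation would also surface `readv` errors and return the used descriptors to the guest.

use libc::c_int;

use crate::devices::virtio::iovec::{IoVecBufferMut, IoVecError};
use crate::devices::virtio::queue::Queue;
use crate::vstate::memory::GuestMemoryMmap;

// Illustrative only: `buffer`, `queue`, `mem` and `tap_fd` are assumed to be owned by the
// network device; error handling is reduced to the bare minimum.
fn rx_readv_sketch(
    buffer: &mut IoVecBufferMut<'_>,
    queue: &mut Queue,
    mem: &GuestMemoryMmap,
    tap_fd: c_int,
) -> Result<usize, IoVecError> {
    // Park every available RX descriptor chain in the single (sparse) buffer.
    while let Some(head) = queue.pop() {
        // SAFETY: each chain is appended to this buffer exactly once.
        let _ = unsafe { buffer.append_descriptor_chain(mem, head)? };
    }

    // Read from the TAP device straight into guest memory, with no intermediate copy.
    // SAFETY: the iovec array only describes valid, writable guest memory.
    let bytes_read = unsafe {
        libc::readv(
            tap_fd,
            buffer.as_iovec_ptr(),
            i32::try_from(buffer.iovec_count()).unwrap(),
        )
    };
    if bytes_read < 0 {
        // A real implementation would propagate the errno instead of swallowing it.
        return Ok(0);
    }

    // Release only the memory that was just filled (assumed here to cover whole chains);
    // the remaining chains stay parked for the next readv call.
    buffer.drop_iovecs(u32::try_from(bytes_read).unwrap())?;
    Ok(usize::try_from(bytes_read).unwrap())
}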