From 845cb76b2473209911ef67f325959ffcfdc8facf Mon Sep 17 00:00:00 2001
From: Changwei Ge
Date: Fri, 18 Feb 2022 15:22:22 +0800
Subject: [PATCH 1/9] api: also update fs backends in the backends collection

After a remount, the mount time and backend configuration might have
changed, so refresh the corresponding entry in the backends collection.

Signed-off-by: Changwei Ge
---
 src/bin/nydusd/daemon.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/bin/nydusd/daemon.rs b/src/bin/nydusd/daemon.rs
index c553990b846..50127493992 100644
--- a/src/bin/nydusd/daemon.rs
+++ b/src/bin/nydusd/daemon.rs
@@ -326,6 +326,9 @@ pub trait NydusDaemon: DaemonStateMachineSubscriber {
             e => DaemonError::Rafs(e),
         })?;

+        // To update mounted time and backend configurations.
+        self.backend_collection().add(&cmd.mountpoint, &cmd)?;
+
         // Update mounts opaque from UpgradeManager
         if let Some(mut mgr_guard) = self.upgrade_mgr() {
             upgrade::update_mounts_state(&mut mgr_guard, cmd)?;

From 5ef881556eef7891cdd87f0c03ae50e4c048f186 Mon Sep 17 00:00:00 2001
From: gexuyang
Date: Tue, 8 Mar 2022 15:53:14 +0800
Subject: [PATCH 2/9] nydus-image: change default value of v6_force_extended_inode to true

The JVM in Share:on mode checks the mtime to determine whether the
version matches, but images generated by nydus-image currently do not
carry the mtime. Setting v6_force_extended_inode to true makes the
builder use the EROFS extended inode format, which records the mtime
attribute.

Fixes: #39988119

Signed-off-by: gexuyang
---
 src/bin/nydus-image/builder/diff.rs      | 5 +++++
 src/bin/nydus-image/builder/directory.rs | 2 ++
 src/bin/nydus-image/core/node.rs         | 7 ++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/bin/nydus-image/builder/diff.rs b/src/bin/nydus-image/builder/diff.rs
index 84c1b57d0e0..eb1ace0ab0c 100644
--- a/src/bin/nydus-image/builder/diff.rs
+++ b/src/bin/nydus-image/builder/diff.rs
@@ -218,6 +218,7 @@ fn walk_diff(
             Overlay::UpperAddition,
             ctx.chunk_size,
             ctx.explicit_uidgid,
+            true,
         )
         .with_context(|| format!("failed to create node from {:?}", child_path))?;
@@ -231,6 +232,7 @@
             Overlay::Lower,
             ctx.chunk_size,
             ctx.explicit_uidgid,
+            true,
         )?;
         if same_file(&lower_node, &child_node) {
             child_node.overlay = Overlay::Lower;
@@ -277,6 +279,7 @@ fn walk_all(ctx: &BuildContext, dir_root: PathBuf, dir_path: PathBuf) -> Result<
             Overlay::UpperAddition,
             ctx.chunk_size,
             ctx.explicit_uidgid,
+            true,
         )
         .with_context(|| format!("failed to create node from {:?}", child_path))?;
@@ -511,6 +514,7 @@ impl DiffBuilder {
             Overlay::UpperAddition,
             ctx.chunk_size,
             ctx.explicit_uidgid,
+            true,
         )?;
         let mut tree = Tree::new(root);
         tree.children = self.build_tree_from_children(
@@ -549,6 +553,7 @@
             Overlay::UpperAddition,
             ctx.chunk_size,
             ctx.explicit_uidgid,
+            true,
         )
         .with_context(|| format!("failed to create node from {:?}", child_path))?;

diff --git a/src/bin/nydus-image/builder/directory.rs b/src/bin/nydus-image/builder/directory.rs
index 5b790d9b099..fef405cc055 100644
--- a/src/bin/nydus-image/builder/directory.rs
+++ b/src/bin/nydus-image/builder/directory.rs
@@ -50,6 +50,7 @@ impl FilesystemTreeBuilder {
             Overlay::UpperAddition,
             ctx.chunk_size,
             parent.explicit_uidgid,
+            true,
         )
         .with_context(|| format!("failed to create node {:?}", path))?;
@@ -91,6 +92,7 @@ impl DirectoryBuilder {
             Overlay::UpperAddition,
             ctx.chunk_size,
             ctx.explicit_uidgid,
+            true,
         )?;
         let mut tree = Tree::new(node);
         let tree_builder = FilesystemTreeBuilder::new();

diff --git a/src/bin/nydus-image/core/node.rs b/src/bin/nydus-image/core/node.rs
index acbc4c21eb8..2ef259fab3a 100644
--- a/src/bin/nydus-image/core/node.rs
+++ b/src/bin/nydus-image/core/node.rs
@@ -229,6 +229,7 @@ impl Node {
         overlay: Overlay,
         chunk_size: u32,
         explicit_uidgid: bool,
+        v6_force_extended_inode: bool,
     ) -> Result {
         let target = Self::generate_target(&path, &source);
         let target_vec = Self::generate_target_vec(&target);
@@ -252,7 +253,7 @@ impl Node {
             offset: 0,
             dirents: Vec::new(),
             v6_datalayout: EROFS_INODE_FLAT_PLAIN,
-            v6_force_extended_inode: false,
+            v6_force_extended_inode,
             v6_compact_inode: false,
         };
@@ -1737,6 +1738,7 @@ mod tests {
             Overlay::UpperAddition,
             RAFS_DEFAULT_CHUNK_SIZE as u32,
             false,
+            false,
         )
         .unwrap();
@@ -1762,6 +1764,7 @@ mod tests {
             Overlay::UpperAddition,
             RAFS_DEFAULT_CHUNK_SIZE as u32,
             false,
+            false,
         )
         .unwrap();
@@ -1868,6 +1871,7 @@ mod tests {
             Overlay::UpperAddition,
             RAFS_DEFAULT_CHUNK_SIZE as u32,
             false,
+            false,
         )
         .unwrap();
@@ -1880,6 +1884,7 @@ mod tests {
             Overlay::UpperAddition,
             RAFS_DEFAULT_CHUNK_SIZE as u32,
             false,
+            false,
         )
         .unwrap();

From 6aff1d5cd4d0fc588479c70b5158c3ff63c0542c Mon Sep 17 00:00:00 2001
From: zhaoshang
Date: Fri, 24 Dec 2021 15:55:27 +0800
Subject: [PATCH 3/9] nydus-image/create: fix an error where v5 dumps meta info

Blob meta info is not required for V5 blobs, so set the option
blob_meta_info_enabled to true only when building V6 images.

Signed-off-by: zhaoshang
---
 src/bin/nydus-image/builder/diff.rs      |  2 +-
 src/bin/nydus-image/builder/directory.rs |  2 +-
 src/bin/nydus-image/core/context.rs      | 28 +++++++++++++-----------
 storage/src/meta/mod.rs                  | 16 +++++---------
 4 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/src/bin/nydus-image/builder/diff.rs b/src/bin/nydus-image/builder/diff.rs
index eb1ace0ab0c..36afb26c722 100644
--- a/src/bin/nydus-image/builder/diff.rs
+++ b/src/bin/nydus-image/builder/diff.rs
@@ -302,7 +302,7 @@ fn dump_blob(
     let mut blob_ctx = BlobContext::new(blob_id, blob_storage)?;
     blob_ctx.set_chunk_dict(chunk_dict);
     blob_ctx.set_chunk_size(ctx.chunk_size);
-    blob_ctx.set_meta_info_enabled(true);
+    blob_ctx.set_meta_info_enabled(ctx.fs_version == RafsVersion::V6);

     // Since all layers are built concurrently, it is not possible to deduplicate
     // chunk between layers while ensuring reproducible build, so we only do

diff --git a/src/bin/nydus-image/builder/directory.rs b/src/bin/nydus-image/builder/directory.rs
index fef405cc055..090e49bf6a4 100644
--- a/src/bin/nydus-image/builder/directory.rs
+++ b/src/bin/nydus-image/builder/directory.rs
@@ -133,7 +133,7 @@ impl Builder for DirectoryBuilder {
         let mut blob_ctx = BlobContext::new(ctx.blob_id.clone(), ctx.blob_storage.clone())?;
         blob_ctx.set_chunk_dict(blob_mgr.get_chunk_dict());
         blob_ctx.set_chunk_size(ctx.chunk_size);
-        blob_ctx.set_meta_info_enabled(true);
+        blob_ctx.set_meta_info_enabled(ctx.fs_version == RafsVersion::V6);

         blob_mgr.extend_blob_table_from_chunk_dict()?;
         let blob_index = blob_mgr.alloc_index()?;

diff --git a/src/bin/nydus-image/core/context.rs b/src/bin/nydus-image/core/context.rs
index 2705e52aa1a..50d5600d0e6 100644
--- a/src/bin/nydus-image/core/context.rs
+++ b/src/bin/nydus-image/core/context.rs
@@ -321,21 +321,23 @@ impl BlobContext {
     }

     pub fn add_chunk_meta_info(&mut self, chunk: &ChunkWrapper) -> Result<()> {
-        if self.blob_meta_info_enabled {
-            debug_assert!(chunk.index() as usize == self.blob_meta_info.len());
-            let mut meta = BlobChunkInfoOndisk::default();
-            meta.set_compressed_offset(chunk.compressed_offset());
-            meta.set_compressed_size(chunk.compressed_size());
-            meta.set_uncompressed_offset(chunk.uncompressed_offset(), self.blob_meta_info_enabled);
-            meta.set_uncompressed_size(chunk.uncompressed_size());
-            trace!(
-                "chunk uncompressed {} size {}",
-                meta.uncompressed_offset(),
-                meta.uncompressed_size()
-            );
-            self.blob_meta_info.push(meta);
+        if !self.blob_meta_info_enabled {
+            return Ok(());
         }

+        debug_assert!(chunk.index() as usize == self.blob_meta_info.len());
+        let mut meta = BlobChunkInfoOndisk::default();
+        meta.set_compressed_offset(chunk.compressed_offset());
+        meta.set_compressed_size(chunk.compressed_size());
+        meta.set_uncompressed_offset(chunk.uncompressed_offset());
+        meta.set_uncompressed_size(chunk.uncompressed_size());
+        trace!(
+            "chunk uncompressed {} size {}",
+            meta.uncompressed_offset(),
+            meta.uncompressed_size()
+        );
+        self.blob_meta_info.push(meta);
+
         Ok(())
     }

diff --git a/storage/src/meta/mod.rs b/storage/src/meta/mod.rs
index 54125d67df1..3ac33d4281f 100644
--- a/storage/src/meta/mod.rs
+++ b/storage/src/meta/mod.rs
@@ -218,16 +218,10 @@ impl BlobChunkInfoOndisk {
     /// Set uncompressed offset of the chunk.
     #[inline]
-    pub fn set_uncompressed_offset(&mut self, offset: u64, aligned_4k: bool) {
-        if aligned_4k {
-            debug_assert!(offset & !BLOB_CHUNK_UNCOMP_OFFSET_MASK == 0);
-            self.uncomp_info &= !BLOB_CHUNK_UNCOMP_OFFSET_MASK;
-            self.uncomp_info |= offset & BLOB_CHUNK_UNCOMP_OFFSET_MASK;
-        } else {
-            debug_assert!(offset & !BLOB_CHUNK_COMP_OFFSET_MASK == 0);
-            self.uncomp_info &= !BLOB_CHUNK_COMP_OFFSET_MASK;
-            self.uncomp_info |= offset & BLOB_CHUNK_COMP_OFFSET_MASK;
-        }
+    pub fn set_uncompressed_offset(&mut self, offset: u64) {
+        debug_assert!(offset & !BLOB_CHUNK_UNCOMP_OFFSET_MASK == 0);
+        self.uncomp_info &= !BLOB_CHUNK_UNCOMP_OFFSET_MASK;
+        self.uncomp_info |= offset & BLOB_CHUNK_UNCOMP_OFFSET_MASK;
     }

     /// Get uncompressed end of the chunk.
@@ -760,7 +754,7 @@ mod tests {
         assert_eq!(chunk.compressed_offset(), 0x1000);
         assert_eq!(chunk.compressed_size(), 0x100);
         assert_eq!(chunk.compressed_end(), 0x1100);
-        chunk.set_uncompressed_offset(0x2000, false);
+        chunk.set_uncompressed_offset(0x2000);
         chunk.set_uncompressed_size(0x100);
         assert_eq!(chunk.uncompressed_offset(), 0x2000);
         assert_eq!(chunk.uncompressed_size(), 0x100);

From 9f202bb3ce883d75773b690ac99d7a2afb0c77fe Mon Sep 17 00:00:00 2001
From: Gao Xiang
Date: Mon, 20 Dec 2021 15:48:32 +0800
Subject: [PATCH 4/9] nydus-image: sort v6 dirents again due to improper `.' and `..' order

`.' and `..' weren't included when the v6 dirents were sorted, so sort
them again after they have been added.

Signed-off-by: Gao Xiang
---
 src/bin/nydus-image/core/bootstrap.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/bin/nydus-image/core/bootstrap.rs b/src/bin/nydus-image/core/bootstrap.rs
index 68815d18237..c9afd7da832 100644
--- a/src/bin/nydus-image/core/bootstrap.rs
+++ b/src/bin/nydus-image/core/bootstrap.rs
@@ -289,6 +289,9 @@ impl Bootstrap {
                 dirs.push(child);
             }
         }
+        /* XXX: `.' and `..' should be sorted globally too */
+        node.dirents
+            .sort_unstable_by(|a, b| a.1.as_os_str().cmp(&b.1.as_os_str()) as std::cmp::Ordering);

         for dir in dirs {
             self.update_dirents(nodes, dir, tree.node.offset);

From df2bde0a705606bc126035dd559bc6398d3b98cc Mon Sep 17 00:00:00 2001
From: Changwei Ge
Date: Thu, 9 Dec 2021 10:10:56 +0800
Subject: [PATCH 5/9] nydus-image/inspect: print size info of chunks

The `chunk` subcommand now also prints each chunk's compressed and
decompressed offsets and sizes, which helps analyze the rafs layout.
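With this change the subcommand output looks roughly like the
following (the values are made up for illustration; the layout follows
the new println! template in this patch):

    File: bin/ls  Parent Path: /usr
    Compressed Offset: 4096, Compressed Size: 1024
    Decompressed Offset: 8192, Decompressed Size: 4096
    Chunk ID: 1a2b3c..., Blob ID: 80da97...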
Signed-off-by: Changwei Ge
---
 src/bin/nydus-image/inspect.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/bin/nydus-image/inspect.rs b/src/bin/nydus-image/inspect.rs
index abd6197c314..13311fc5960 100644
--- a/src/bin/nydus-image/inspect.rs
+++ b/src/bin/nydus-image/inspect.rs
@@ -349,11 +349,17 @@ Blocks: {blocks}"#,
         let path = self.path_from_ino(inode.parent()).unwrap();
         println!(
             r#"
-        {:width$} Parent Path {:width$}
+        File: {:width$} Parent Path: {:width$}
+        Compressed Offset: {}, Compressed Size: {}
+        Decompressed Offset: {}, Decompressed Size: {}
         Chunk ID: {:50}, Blob ID: {}
         "#,
             name.to_string_lossy(),
             path.to_string_lossy(),
+            c.compress_offset,
+            c.compress_size,
+            c.decompress_offset,
+            c.decompress_size,
             c.block_id,
             if let Ok(blob_id) = self.state.get_blob_id(c.blob_index) {
                 blob_id

From db8790bfc23de762335d13f316c38e94359c3b84 Mon Sep 17 00:00:00 2001
From: Changwei Ge
Date: Thu, 9 Dec 2021 10:20:25 +0800
Subject: [PATCH 6/9] nydus-image/inspect: trim white spaces before parsing

Otherwise, string parsing may fail and cause the inspector to report an
error.

Signed-off-by: Changwei Ge
---
 src/bin/nydus-image/inspect.rs | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/bin/nydus-image/inspect.rs b/src/bin/nydus-image/inspect.rs
index 13311fc5960..9d18dbd5b79 100644
--- a/src/bin/nydus-image/inspect.rs
+++ b/src/bin/nydus-image/inspect.rs
@@ -358,8 +358,8 @@ Blocks: {blocks}"#,
             path.to_string_lossy(),
             c.compress_offset,
             c.compress_size,
-            c.decompress_offset,
-            c.decompress_size,
+            c.uncompress_offset,
+            c.uncompress_size,
             c.block_id,
             if let Ok(blob_id) = self.state.get_blob_id(c.blob_index) {
                 blob_id
@@ -739,9 +739,12 @@ impl Executor {
         inspector: &mut RafsInspector,
         input: String,
     ) -> std::result::Result, ExecuteError> {
-        let mut raw = input.strip_suffix("\n").unwrap_or(&input).split(' ');
+        let mut raw = input
+            .strip_suffix("\n")
+            .unwrap_or(&input)
+            .split_ascii_whitespace();
         let cmd = raw.next().unwrap();
-        let args = raw.next();
+        let args = raw.next().map(|a| a.trim());

         debug!("execute {:?} {:?}", cmd, args);

From 47d73d7436fae48ba0a1d4e34739dd847bffcb91 Mon Sep 17 00:00:00 2001
From: Peng Tao
Date: Wed, 9 Mar 2022 16:24:45 +0800
Subject: [PATCH 7/9] blobfs: use fuse-backend-rs passthroughfs

Blob files are passed through to the guest directly by reusing the
PassthroughFs implementation from the fuse-backend-rs crate.
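The resulting BlobFs keeps only the blob-specific on-demand fetching
logic and forwards regular file operations to an embedded PassthroughFs.
A minimal sketch of the delegation pattern introduced below (simplified;
the virtiofs-only config field and the asynchronous Rafs bring-up are
omitted):

    pub struct BlobFs {
        // Handles all regular file I/O on the underlying directory tree.
        pfs: PassthroughFs,
        // Rafs handle plus blob cache dir, used to fetch chunks on demand.
        bootstrap_args: BootstrapArgs,
    }

    impl FileSystem for BlobFs {
        type Inode = u64;
        type Handle = u64;

        fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
            // Let PassthroughFs negotiate FUSE options as usual.
            self.pfs.init(capable)
        }

        fn destroy(&self) {
            self.pfs.destroy()
        }
    }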
Signed-off-by: hugh.gxy Signed-off-by: Liu Bo Signed-off-by: Liu Jiang Signed-off-by: Peng Tao --- blobfs/Cargo.toml | 8 +- blobfs/src/lib.rs | 737 ++++------------------ blobfs/src/multikey.rs | 286 --------- blobfs/src/sync_io.rs | 1314 ++++++---------------------------------- 4 files changed, 293 insertions(+), 2052 deletions(-) delete mode 100644 blobfs/src/multikey.rs diff --git a/blobfs/Cargo.toml b/blobfs/Cargo.toml index c6bb727d449..291c2ec8788 100644 --- a/blobfs/Cargo.toml +++ b/blobfs/Cargo.toml @@ -18,10 +18,12 @@ fuse-backend-rs = { version = "0.3.0" } rafs = { path = "../rafs" } nydus-error = { path = "../error" } -storage = { path = "../storage", features = ["backend-localfs", "backend-oss", "backend-registry"] } +storage = { path = "../storage", features = ["backend-localfs"] } [features] -virtiofs = [ "fuse-backend-rs/virtiofs", "vm-memory/backend-mmap"] +virtiofs = [ "fuse-backend-rs/virtiofs", "rafs/virtio-fs" ] +backend-oss = ["rafs/backend-oss"] +backend-registry = ["rafs/backend-registry"] [dev-dependencies] -nydus-app = { path = "../app" } +nydus-app = { version = "0.2", path = "../app" } diff --git a/blobfs/src/lib.rs b/blobfs/src/lib.rs index 0b1b2881d43..9fe380ca17f 100644 --- a/blobfs/src/lib.rs +++ b/blobfs/src/lib.rs @@ -14,295 +14,49 @@ #[macro_use] extern crate log; +use fuse_backend_rs::{ + api::{filesystem::*, BackendFileSystem, VFS_MAX_INO}, + passthrough::Config as PassthroughConfig, + passthrough::PassthroughFs, +}; +use nydus_error::{einval, eother}; +use rafs::{ + fs::{Rafs, RafsConfig}, + RafsIoRead, +}; +use serde::Deserialize; use std::any::Any; -use std::collections::{btree_map, BTreeMap}; #[cfg(feature = "virtiofs")] -use std::ffi::OsString; -use std::ffi::{CStr, CString}; +use std::convert::TryInto; +#[cfg(feature = "virtiofs")] +use std::ffi::CStr; +use std::ffi::CString; +#[cfg(feature = "virtiofs")] use std::fs::File; use std::io; +#[cfg(feature = "virtiofs")] use std::mem::MaybeUninit; #[cfg(feature = "virtiofs")] -use std::os::unix::ffi::OsStringExt; -use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::path::Path; +use std::os::unix::ffi::OsStrExt; #[cfg(feature = "virtiofs")] -use std::path::PathBuf; +use std::os::unix::io::{AsRawFd, FromRawFd}; +use std::path::Path; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockWriteGuard}; -use std::time::Duration; - -use serde::Deserialize; -use vm_memory::ByteValued; - -use fuse_backend_rs::{ - api::{filesystem::*, BackendFileSystem, VFS_MAX_INO}, - passthrough::CachePolicy, - // transport::FileReadWriteVolatile, -}; +use std::sync::{Arc, Mutex}; +use std::thread; #[cfg(feature = "virtiofs")] -use fuse_backend_rs::abi::virtio_fs; -#[cfg(feature = "virtiofs")] -use fuse_backend_rs::transport::FsCacheReqHandler; -// #[cfg(feature = "virtiofs")] -// use rafs::metadata::cached::CachedChunkInfo; -// #[cfg(feature = "virtiofs")] -// use rafs::metadata::layout::OndiskChunkInfo; - -use nydus_error::{einval, eother}; - -use rafs::{ - fs::{Rafs, RafsConfig}, - RafsIoRead, -}; +use storage::device::BlobPrefetchRequest; +use vm_memory::ByteValued; mod sync_io; -#[allow(dead_code)] -mod multikey; -use multikey::MultikeyBTreeMap; - -const CURRENT_DIR_CSTR: &[u8] = b".\0"; -const PARENT_DIR_CSTR: &[u8] = b"..\0"; +#[cfg(feature = "virtiofs")] const EMPTY_CSTR: &[u8] = b"\0"; -const PROC_CSTR: &[u8] = b"/proc\0"; type Inode = u64; type Handle = u64; -#[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Debug, 
Default)] -struct InodeAltKey { - ino: libc::ino64_t, - dev: libc::dev_t, -} - -impl InodeAltKey { - fn from_stat(st: &libc::stat64) -> Self { - InodeAltKey { - ino: st.st_ino, - dev: st.st_dev, - } - } -} - -struct InodeData { - inode: Inode, - // Most of these aren't actually files but ¯\_(ツ)_/¯. - file: File, - refcount: AtomicU64, -} - -impl InodeData { - fn new(inode: Inode, file: File, refcount: u64) -> Self { - InodeData { - inode, - file, - refcount: AtomicU64::new(refcount), - } - } - - // When making use of the underlying RawFd, the caller must ensure that the Arc - // object is within scope. Otherwise it may cause race window to access wrong target fd. - // By introducing this method, we could explicitly audit all callers making use of the - // underlying RawFd. - fn get_raw_fd(&self) -> RawFd { - self.file.as_raw_fd() - } -} - -/// Data structures to manage accessed inodes. -struct InodeMap { - inodes: RwLock>>, -} - -impl InodeMap { - fn new() -> Self { - InodeMap { - inodes: RwLock::new(MultikeyBTreeMap::new()), - } - } - - fn clear(&self) { - self.inodes.write().unwrap().clear(); - } - - fn get(&self, inode: Inode) -> io::Result> { - self.inodes - .read() - .unwrap() - .get(&inode) - .map(Arc::clone) - .ok_or_else(ebadf) - } - - fn get_alt(&self, altkey: &InodeAltKey) -> Option> { - self.inodes.read().unwrap().get_alt(altkey).map(Arc::clone) - } - - fn get_map_mut( - &self, - ) -> RwLockWriteGuard>> { - self.inodes.write().unwrap() - } - - fn insert(&self, inode: Inode, altkey: InodeAltKey, data: InodeData) { - self.inodes - .write() - .unwrap() - .insert(inode, altkey, Arc::new(data)); - } -} - -struct HandleData { - inode: Inode, - file: File, - lock: Mutex<()>, -} - -impl HandleData { - fn new(inode: Inode, file: File) -> Self { - HandleData { - inode, - file, - lock: Mutex::new(()), - } - } - - fn get_file_mut(&self) -> (MutexGuard<()>, &File) { - (self.lock.lock().unwrap(), &self.file) - } - - // When making use of the underlying RawFd, the caller must ensure that the Arc - // object is within scope. Otherwise it may cause race window to access wrong target fd. - // By introducing this method, we could explicitly audit all callers making use of the - // underlying RawFd. - fn get_handle_raw_fd(&self) -> RawFd { - self.file.as_raw_fd() - } -} - -struct HandleMap { - handles: RwLock>>, -} - -impl HandleMap { - fn new() -> Self { - HandleMap { - handles: RwLock::new(BTreeMap::new()), - } - } - - fn clear(&self) { - self.handles.write().unwrap().clear(); - } - - fn insert(&self, handle: Handle, data: HandleData) { - self.handles.write().unwrap().insert(handle, Arc::new(data)); - } - - fn release(&self, handle: Handle, inode: Inode) -> io::Result<()> { - let mut handles = self.handles.write().unwrap(); - - if let btree_map::Entry::Occupied(e) = handles.entry(handle) { - if e.get().inode == inode { - // We don't need to close the file here because that will happen automatically when - // the last `Arc` is dropped. 
- e.remove(); - return Ok(()); - } - } - - Err(ebadf()) - } - - fn get(&self, handle: Handle, inode: Inode) -> io::Result> { - self.handles - .read() - .unwrap() - .get(&handle) - .filter(|hd| hd.inode == inode) - .map(Arc::clone) - .ok_or_else(ebadf) - } -} - -// #[derive(Clone)] -// struct BlobOffsetMapArg { -// base: *const u8, -// size: usize, -// } - -// unsafe impl Sync for BlobOffsetMapArg {} -// unsafe impl Send for BlobOffsetMapArg {} - -// struct BlobOffsetMap { -// map: RwLock>>, -// } - -// impl BlobOffsetMap { -// fn new() -> Self { -// BlobOffsetMap { -// map: RwLock::new(BTreeMap::new()), -// } -// } - -// fn get(&self, blob: &OsString) -> io::Result> { -// self.map -// .read() -// .unwrap() -// .get(blob) -// .map(Arc::clone) -// .ok_or_else(|| einval!()) -// } - -// fn insert(&self, blob: OsString, arg: Arc) { -// self.map.write().unwrap().insert(blob, arg); -// } - -// fn clear(&self) { -// let mut map = self.map.write().unwrap(); -// for (_, arg) in map.iter_mut() { -// trace!("unmap offset map ptr"); -// unsafe { libc::munmap((*arg).base as *mut u8 as *mut libc::c_void, (*arg).size) }; -// } - -// map.clear(); -// } -// } - -// struct DummyZcWriter {} - -// impl io::Write for DummyZcWriter { -// fn write(&mut self, _buf: &[u8]) -> io::Result { -// Ok(0) -// } - -// fn flush(&mut self) -> io::Result<()> { -// Ok(()) -// } -// } - -// impl ZeroCopyWriter for DummyZcWriter { -// fn write_from( -// &mut self, -// f: &mut dyn FileReadWriteVolatile, -// mut count: usize, -// off: u64, -// ) -> io::Result { -// let mut buf = Vec::with_capacity(count); -// count = f.read_vectored_at_volatile( -// // Safe because we have made sure buf has at least count capacity above -// unsafe { &[VolatileSlice::new(buf.as_mut_ptr(), count)] }, -// off, -// )?; - -// trace!("dummy zc write count {} off {}", count, off); -// Ok(count) -// } -// } - #[repr(C, packed)] #[derive(Clone, Copy, Debug, Default)] struct LinuxDirent64 { @@ -343,67 +97,8 @@ impl FromStr for BlobOndemandConfig { /// Options that configure the behavior of the blobfs fuse file system. #[derive(Debug, Clone, PartialEq)] pub struct Config { - /// How long the FUSE client should consider directory entries to be valid. If the contents of a - /// directory can only be modified by the FUSE client (i.e., the file system has exclusive - /// access), then this should be a large value. - /// - /// The default value for this option is 5 seconds. - pub entry_timeout: Duration, - - /// How long the FUSE client should consider file and directory attributes to be valid. If the - /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file - /// system has exclusive access), then this should be set to a large value. - /// - /// The default value for this option is 5 seconds. - pub attr_timeout: Duration, - - /// The caching policy the file system should use. See the documentation of `CachePolicy` for - /// more details. - pub cache_policy: CachePolicy, - - /// Whether the file system should enabled writeback caching. This can improve performance as it - /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file - /// system. However, enabling this option can increase the risk of data corruption if the file - /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT** - /// have exclusive access). 
Additionally, the file system should have read access to all files - /// in the directory it is serving as the FUSE client may send read requests even for files - /// opened with `O_WRONLY`. - /// - /// Therefore callers should only enable this option when they can guarantee that: 1) the file - /// system has exclusive access to the directory and 2) the file system has read permissions for - /// all files in that directory. - /// - /// The default value for this option is `false`. - pub writeback: bool, - - /// The path of the root directory. - /// - /// The default is `/`. - pub root_dir: String, - - /// Whether the file system should support Extended Attributes (xattr). Enabling this feature may - /// have a significant impact on performance, especially on write parallelism. This is the result - /// of FUSE attempting to remove the special file privileges after each write request. - /// - /// The default value for this options is `false`. - pub xattr: bool, - - /// To be compatible with Vfs and PseudoFs, BlobFs needs to prepare - /// root inode before accepting INIT request. - /// - /// The default value for this option is `true`. - pub do_import: bool, - - /// Control whether no_open is allowed. - /// - /// The default value for this option is `false`. - pub no_open: bool, - - /// Control whether no_opendir is allowed. - /// - /// The default value for this option is `false`. - pub no_opendir: bool, - + /// Blobfs config is embedded with passthrough config + pub ps_config: PassthroughConfig, /// This provides on demand config of blob management. pub blob_ondemand_cfg: String, } @@ -411,24 +106,21 @@ pub struct Config { impl Default for Config { fn default() -> Self { Config { - entry_timeout: Duration::from_secs(5), - attr_timeout: Duration::from_secs(5), - cache_policy: Default::default(), - writeback: false, - root_dir: String::from("/"), - xattr: false, - do_import: true, - no_open: false, - no_opendir: false, + ps_config: PassthroughConfig::default(), blob_ondemand_cfg: Default::default(), } } } +#[allow(dead_code)] +struct RafsHandle { + rafs: Arc>>, + handle: Arc>>>>, +} + #[allow(dead_code)] struct BootstrapArgs { - rafs: Rafs, - // bootstrap: String, + rafs_handle: RafsHandle, blob_cache_dir: String, } @@ -436,6 +128,34 @@ struct BootstrapArgs { unsafe impl Sync for BootstrapArgs {} unsafe impl Send for BootstrapArgs {} +#[cfg(feature = "virtiofs")] +impl BootstrapArgs { + fn get_rafs_handle(&self) -> io::Result<()> { + let mut c = self.rafs_handle.rafs.lock().unwrap(); + match (*self.rafs_handle.handle.lock().unwrap()).take() { + Some(handle) => { + let rafs = handle.join().unwrap().ok_or_else(|| { + error!("blobfs: get rafs failed."); + einval!("create rafs failed in thread.") + })?; + debug!("blobfs: async create Rafs finish!"); + + *c = Some(rafs); + Ok(()) + } + None => Err(einval!("create rafs failed in thread.")), + } + } + + fn fetch_range_sync(&self, prefetches: &[BlobPrefetchRequest]) -> io::Result<()> { + let c = self.rafs_handle.rafs.lock().unwrap(); + match &*c { + Some(rafs) => rafs.fetch_range_synchronous(prefetches), + None => Err(einval!("create rafs failed in thread.")), + } + } +} + /// A file system that simply "passes through" all requests it receives to the underlying file /// system. /// @@ -444,36 +164,9 @@ unsafe impl Send for BootstrapArgs {} /// directory ends up as the root of the file system process. One way to accomplish this is via a /// combination of mount namespaces and the pivot_root system call. 
pub struct BlobFs { - // File descriptors for various points in the file system tree. These fds are always opened with - // the `O_PATH` option so they cannot be used for reading or writing any data. See the - // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot - // do with an fd opened with this flag. - inode_map: InodeMap, - next_inode: AtomicU64, - - // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be - // used for reading and writing data. - handle_map: HandleMap, - next_handle: AtomicU64, - - // File descriptor pointing to the `/proc` directory. This is used to convert an fd from - // `inodes` into one that can go into `handles`. This is accomplished by reading the - // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant - // to be serving doesn't have access to `/proc`. - proc: File, - - // Whether writeback caching is enabled for this directory. This will only be true when - // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`. - writeback: AtomicBool, - - // Whether no_open is enabled. - no_open: AtomicBool, - - // Whether no_opendir is enabled. - no_opendir: AtomicBool, - + pfs: PassthroughFs, + #[cfg(feature = "virtiofs")] cfg: Config, - #[allow(dead_code)] bootstrap_args: BootstrapArgs, } @@ -481,30 +174,16 @@ pub struct BlobFs { impl BlobFs { /// Create a Blob file system instance. pub fn new(cfg: Config) -> io::Result { - // Safe because this is a constant value and a valid C string. - let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) }; - let proc = Self::open_file( - libc::AT_FDCWD, - proc_cstr, - libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, - 0, - ) - .map_err(|e| einval!(e))?; + trace!("BlobFs config is: {:?}", cfg); + #[cfg(feature = "virtiofs")] + let cfg_bak = cfg.clone(); let bootstrap_args = Self::load_bootstrap(&cfg)?; + let pfs = PassthroughFs::new(cfg.ps_config)?; Ok(BlobFs { - inode_map: InodeMap::new(), - next_inode: AtomicU64::new(ROOT_ID + 1), - - handle_map: HandleMap::new(), - next_handle: AtomicU64::new(1), - - proc, - - writeback: AtomicBool::new(false), - no_open: AtomicBool::new(false), - no_opendir: AtomicBool::new(false), - cfg, + pfs, + #[cfg(feature = "virtiofs")] + cfg: cfg_bak, bootstrap_args, }) } @@ -517,7 +196,6 @@ impl BlobFs { return Err(einval!("no valid blob cache dir")); } - // mmap bootstrap into current process let path = Path::new(blob_ondemand_conf.bootstrap_path.as_str()); if !path.exists() || blob_ondemand_conf.bootstrap_path == String::default() { return Err(einval!("no valid bootstrap")); @@ -528,52 +206,37 @@ impl BlobFs { rafs_conf.mode = "direct".to_string(); let mut bootstrap = ::from_file(path.to_str().unwrap()).map_err(|e| eother!(e))?; - let mut rafs = Rafs::new(rafs_conf, "blobfs", &mut bootstrap) - .map_err(|e| eother!(format!("blobfs: new rafs failed {:?}", e)))?; - rafs.import(bootstrap, None) - .map_err(|e| eother!(format!("blobfs: rafs import failed {:?}", e)))?; + + trace!("blobfs: async create Rafs start!"); + let rafs_join_handle = std::thread::spawn(move || { + let mut rafs = match Rafs::new(rafs_conf, "blobfs", &mut bootstrap) { + Ok(rafs) => rafs, + Err(e) => { + error!("blobfs: new rafs failed {:?}.", e); + return None; + } + }; + match rafs.import(bootstrap, None) { + Ok(_) => {} + Err(e) => { + error!("blobfs: new rafs failed {:?}.", e); + return None; + } + } + Some(rafs) + }); + let rafs_handle = RafsHandle { + rafs: 
Arc::new(Mutex::new(None)), + handle: Arc::new(Mutex::new(Some(rafs_join_handle))), + }; Ok(BootstrapArgs { - rafs, - // bootstrap: blob_ondemand_conf.bootstrap_path.clone(), + rafs_handle, blob_cache_dir: blob_ondemand_conf.blob_cache_dir, }) } - /// Initialize the Blob file system. - pub fn import(&self) -> io::Result<()> { - let root = CString::new(self.cfg.root_dir.as_str()).expect("CString::new failed"); - // We use `O_PATH` because we just want this for traversing the directory tree - // and not for actually reading the contents. - let f = Self::open_file( - libc::AT_FDCWD, - &root, - libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, - 0, - )?; - - let st = Self::stat(&f)?; - - // Safe because this doesn't modify any memory and there is no need to check the return - // value because this system call always succeeds. We need to clear the umask here because - // we want the client to be able to set all the bits in the mode. - unsafe { libc::umask(0o000) }; - - // Not sure why the root inode gets a refcount of 2 but that's what libfuse does. - self.inode_map.insert( - ROOT_ID, - InodeAltKey::from_stat(&st), - InodeData::new(ROOT_ID, f, 2), - ); - - Ok(()) - } - - /// Get the list of file descriptors which should be reserved across live upgrade. - pub fn keep_fds(&self) -> Vec { - vec![self.proc.as_raw_fd()] - } - + #[cfg(feature = "virtiofs")] fn stat(f: &File) -> io::Result { // Safe because this is a constant value and a valid C string. let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; @@ -596,41 +259,15 @@ impl BlobFs { } } - #[cfg(feature = "virtiofs")] - fn readlinkat(dfd: i32, pathname: &CStr) -> io::Result { - let mut buf = Vec::with_capacity(256); - - loop { - let buf_read = unsafe { - libc::readlinkat( - dfd, - pathname.as_ptr(), - buf.as_mut_ptr() as *mut _, - buf.capacity(), - ) - }; - if buf_read < 0 { - return Err(io::Error::last_os_error()); - } - - unsafe { - buf.set_len(buf_read as usize); - } - - if buf_read as usize != buf.capacity() { - buf.shrink_to_fit(); - - return Ok(PathBuf::from(OsString::from_vec(buf))); - } - - // Trigger the internal buffer resizing logic of `Vec` by requiring - // more space than the current capacity. The length is guaranteed to be - // the same as the capacity due to the if statement above. - buf.reserve(1); - } + /// Initialize the PassthroughFs + pub fn import(&self) -> io::Result<()> { + self.pfs.import() } - fn open_file(dfd: i32, pathname: &CStr, flags: i32, mode: u32) -> io::Result { + #[cfg(feature = "virtiofs")] + fn open_file(dfd: i32, pathname: &Path, flags: i32, mode: u32) -> io::Result { + let pathname = CString::new(pathname.as_os_str().as_bytes()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let fd = if flags & libc::O_CREAT == libc::O_CREAT { unsafe { libc::openat(dfd, pathname.as_ptr(), flags, mode) } } else { @@ -644,148 +281,12 @@ impl BlobFs { // Safe because we just opened this fd. Ok(unsafe { File::from_raw_fd(fd) }) } - - fn do_lookup(&self, parent: Inode, name: &CStr) -> io::Result { - let p = self.inode_map.get(parent)?; - let f = Self::open_file( - p.get_raw_fd(), - name, - libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, - 0, - )?; - let st = Self::stat(&f)?; - let altkey = InodeAltKey::from_stat(&st); - - let mut found = None; - 'search: loop { - match self.inode_map.get_alt(&altkey) { - // No existing entry found - None => break 'search, - Some(data) => { - let curr = data.refcount.load(Ordering::Acquire); - // forgot_one() has just destroyed the entry, retry... 
- if curr == 0 { - continue 'search; - } - - // Saturating add to avoid integer overflow, it's not realistic to saturate u64. - let new = curr.saturating_add(1); - - // Synchronizes with the forgot_one() - if data - .refcount - .compare_exchange(curr, new, Ordering::AcqRel, Ordering::Acquire) - .is_ok() - { - found = Some(data.inode); - break; - } - } - } - } - - let inode = if let Some(v) = found { - v - } else { - let mut inodes = self.inode_map.get_map_mut(); - - // Lookup inode_map again after acquiring the inode_map lock, as there might be another - // racing thread already added an inode with the same altkey while we're not holding - // the lock. If so just use the newly added inode, otherwise the inode will be replaced - // and results in EBADF. - match inodes.get_alt(&altkey).map(Arc::clone) { - Some(data) => { - trace!( - "fuse: do_lookup sees existing inode {} altkey {:?}", - data.inode, - altkey - ); - data.refcount.fetch_add(1, Ordering::Relaxed); - data.inode - } - None => { - let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); - if inode > VFS_MAX_INO { - return Err(io::Error::new( - io::ErrorKind::Other, - format!("max inode number reached: {}", VFS_MAX_INO), - )); - } - trace!( - "fuse: do_lookup adds new inode {} altkey {:?}", - inode, - altkey - ); - inodes.insert(inode, altkey, Arc::new(InodeData::new(inode, f, 1))); - inode - } - } - }; - - Ok(Entry { - inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } - - fn forget_one( - inodes: &mut MultikeyBTreeMap>, - inode: Inode, - count: u64, - ) { - // ROOT_ID should not be forgotten, or we're not able to access to files any more. - if inode == ROOT_ID { - return; - } - - if let Some(data) = inodes.get(&inode) { - // Acquiring the write lock on the inode map prevents new lookups from incrementing the - // refcount but there is the possibility that a previous lookup already acquired a - // reference to the inode data and is in the process of updating the refcount so we need - // to loop here until we can decrement successfully. - loop { - let curr = data.refcount.load(Ordering::Acquire); - - // Saturating sub because it doesn't make sense for a refcount to go below zero and - // we don't want misbehaving clients to cause integer overflow. - let new = curr.saturating_sub(count); - - trace!( - "fuse: forget inode {} refcount {}, count {}, new_count {}", - inode, - curr, - count, - new - ); - - // Synchronizes with the acquire load in `do_lookup`. - if data - .refcount - .compare_exchange(curr, new, Ordering::AcqRel, Ordering::Acquire) - .is_ok() - { - if new == 0 { - // We just removed the last refcount for this inode. 
- inodes.remove(&inode); - } - break; - } - } - } - } - - fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> { - self.handle_map.release(handle, inode) - } } impl BackendFileSystem for BlobFs { fn mount(&self) -> io::Result<(Entry, u64)> { - let entry = self.do_lookup(ROOT_ID, &CString::new(".").unwrap())?; + let ctx = &Context::default(); + let entry = self.lookup(ctx, ROOT_ID, &CString::new(".").unwrap())?; Ok((entry, VFS_MAX_INO)) } @@ -794,15 +295,14 @@ impl BackendFileSystem for BlobFs { } } -fn ebadf() -> io::Error { - io::Error::from_raw_os_error(libc::EBADF) -} - #[cfg(test)] #[cfg(feature = "virtiofs")] mod tests { use super::*; + use fuse_backend_rs::abi::virtio_fs; + use fuse_backend_rs::transport::FsCacheReqHandler; use nydus_app::setup_logging; + use std::os::unix::prelude::RawFd; struct DummyCacheReq {} @@ -958,25 +458,24 @@ mod tests { }"#; // let rafs_conf = RafsConfig::from_str(config).unwrap(); - let fs_cfg = Config { + let ps_config = PassthroughConfig { root_dir: "/home/b.liu/1_source/3_ali/virtiofs/qemu-my/build-kangaroo/share_dir1" .to_string(), do_import: false, no_open: true, - blob_ondemand_cfg: config.to_string(), ..Default::default() }; + let fs_cfg = Config { + ps_config, + blob_ondemand_cfg: config.to_string(), + }; let fs = BlobFs::new(fs_cfg).unwrap(); fs.import().unwrap(); fs.mount().unwrap(); - let ctx = Context { - uid: 0, - gid: 0, - pid: 0, - }; + let ctx = &Context::default(); // read bootstrap first, should return err as it's not in blobcache dir. // let bootstrap = CString::new("foo").unwrap(); @@ -987,15 +486,15 @@ mod tests { // FIXME: use a real blob id under test4k. let blob_cache_dir = CString::new("blobcache").unwrap(); - let parent_entry = fs.lookup(&ctx, ROOT_ID, &blob_cache_dir).unwrap(); + let parent_entry = fs.lookup(ctx, ROOT_ID, &blob_cache_dir).unwrap(); let blob_id = CString::new("80da976ee69d68af6bb9170395f71b4ef1e235e815e2").unwrap(); - let entry = fs.lookup(&ctx, parent_entry.inode, &blob_id).unwrap(); + let entry = fs.lookup(ctx, parent_entry.inode, &blob_id).unwrap(); let foffset = 0; let len = 1 << 21; let mut req = DummyCacheReq {}; - fs.setupmapping(&ctx, entry.inode, 0, foffset, len, 0, 0, &mut req) + fs.setupmapping(ctx, entry.inode, 0, foffset, len, 0, 0, &mut req) .unwrap(); // FIXME: release fs diff --git a/blobfs/src/multikey.rs b/blobfs/src/multikey.rs deleted file mode 100644 index 965ad01f299..00000000000 --- a/blobfs/src/multikey.rs +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright 2019 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE-BSD-3-Clause file. - -//! Struct MultikeyBTreeMap implementation used by passthrough. - -use std::borrow::Borrow; -use std::collections::btree_map::Iter; -use std::collections::BTreeMap; - -/// A BTreeMap that supports 2 types of keys per value. All the usual restrictions and warnings for -/// `std::collections::BTreeMap` also apply to this struct. Additionally, there is a 1:1 -/// relationship between the 2 key types. In other words, for each `K1` in the map, there is exactly -/// one `K2` in the map and vice versa. -#[derive(Default)] -pub struct MultikeyBTreeMap -where - K1: Ord, - K2: Ord, -{ - // We need to keep a copy of the second key in the main map so that we can remove entries using - // just the main key. Otherwise we would require the caller to provide both keys when calling - // `remove`. 
- main: BTreeMap, - alt: BTreeMap, -} - -impl MultikeyBTreeMap -where - K1: Clone + Ord, - K2: Clone + Ord, -{ - /// Create a new empty MultikeyBTreeMap. - pub fn new() -> Self { - MultikeyBTreeMap { - main: BTreeMap::default(), - alt: BTreeMap::default(), - } - } - - /// Get number of entries in the map. - pub fn len(&self) -> usize { - self.main.len() - } - - /// Returns a reference to the value corresponding to the key. - /// - /// The key may be any borrowed form of `K1``, but the ordering on the borrowed form must match - /// the ordering on `K1`. - pub fn get(&self, key: &Q) -> Option<&V> - where - K1: Borrow, - Q: Ord + ?Sized, - { - self.main.get(key).map(|(_, v)| v) - } - - /// Returns a reference to the value corresponding to the alternate key. - /// - /// The key may be any borrowed form of the `K2``, but the ordering on the borrowed form must - /// match the ordering on `K2`. - /// - /// Note that this method performs 2 lookups: one to get the main key and another to get the - /// value associated with that key. For best performance callers should prefer the `get` method - /// over this method whenever possible as `get` only needs to perform one lookup. - pub fn get_alt(&self, key: &Q2) -> Option<&V> - where - K2: Borrow, - Q2: Ord + ?Sized, - { - if let Some(k) = self.alt.get(key) { - self.get(k) - } else { - None - } - } - - /// Inserts a new entry into the map with the given keys and value. - /// - /// Returns `None` if the map did not have an entry with `k1` or `k2` present. If exactly one - /// key was present, then the value associated with that key is updated, the other key is - /// removed, and the old value is returned. If **both** keys were present then the value - /// associated with the main key is updated, the value associated with the alternate key is - /// removed, and the old value associated with the main key is returned. - pub fn insert(&mut self, k1: K1, k2: K2, v: V) -> Option { - let oldval = if let Some(oldkey) = self.alt.insert(k2.clone(), k1.clone()) { - self.main.remove(&oldkey) - } else { - None - }; - self.main - .insert(k1, (k2.clone(), v)) - .or(oldval) - .map(|(oldk2, v)| { - if oldk2 != k2 { - self.alt.remove(&oldk2); - } - v - }) - } - - /// Remove a key from the map, returning the value associated with that key if it was previously - /// in the map. - /// - /// The key may be any borrowed form of `K1``, but the ordering on the borrowed form must match - /// the ordering on `K1`. - pub fn remove(&mut self, key: &Q) -> Option - where - K1: Borrow, - Q: Ord + ?Sized, - { - self.main.remove(key).map(|(k2, v)| { - self.alt.remove(&k2); - v - }) - } - - /// Clears the map, removing all values. 
- pub fn clear(&mut self) { - self.alt.clear(); - self.main.clear() - } - - pub fn iter(&self) -> Iter<'_, K1, (K2, V)> { - self.main.iter() - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn get() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - assert_eq!(*m.get(&k1).expect("failed to look up main key"), val); - assert_eq!(*m.get_alt(&k2).expect("failed to look up alt key"), val); - } - - #[test] - fn update_main_key() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - let new_k1 = 0x3add_f8f8_c7c5_df5e; - let val2 = 0x7389_f8a7; - assert_eq!( - m.insert(new_k1, k2, val2) - .expect("failed to update main key"), - val - ); - - assert!(m.get(&k1).is_none()); - assert_eq!(*m.get(&new_k1).expect("failed to look up main key"), val2); - assert_eq!(*m.get_alt(&k2).expect("failed to look up alt key"), val2); - } - - #[test] - fn update_alt_key() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - let new_k2 = 0x6825_a60b_61ac_b333; - let val2 = 0xbb14_8f2c; - assert_eq!( - m.insert(k1, new_k2, val2) - .expect("failed to update alt key"), - val - ); - - assert!(m.get_alt(&k2).is_none()); - assert_eq!(*m.get(&k1).expect("failed to look up main key"), val2); - assert_eq!( - *m.get_alt(&new_k2).expect("failed to look up alt key"), - val2 - ); - } - - #[test] - fn update_value() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - let val2 = 0xe42d_79ba; - assert_eq!( - m.insert(k1, k2, val2).expect("failed to update alt key"), - val - ); - - assert_eq!(*m.get(&k1).expect("failed to look up main key"), val2); - assert_eq!(*m.get_alt(&k2).expect("failed to look up alt key"), val2); - } - - #[test] - fn update_both_keys_main() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - let new_k1 = 0xc980_587a_24b3_ae30; - let new_k2 = 0x2773_c5ee_8239_45a2; - let val2 = 0x31f4_33f9; - assert!(m.insert(new_k1, new_k2, val2).is_none()); - - let val3 = 0x8da1_9cf7; - assert_eq!( - m.insert(k1, new_k2, val3) - .expect("failed to update main key"), - val - ); - - // Both new_k1 and k2 should now be gone from the map. - assert!(m.get(&new_k1).is_none()); - assert!(m.get_alt(&k2).is_none()); - - assert_eq!(*m.get(&k1).expect("failed to look up main key"), val3); - assert_eq!( - *m.get_alt(&new_k2).expect("failed to look up alt key"), - val3 - ); - } - - #[test] - fn update_both_keys_alt() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - let new_k1 = 0xc980_587a_24b3_ae30; - let new_k2 = 0x2773_c5ee_8239_45a2; - let val2 = 0x31f4_33f9; - assert!(m.insert(new_k1, new_k2, val2).is_none()); - - let val3 = 0x8da1_9cf7; - assert_eq!( - m.insert(new_k1, k2, val3) - .expect("failed to update main key"), - val2 - ); - - // Both k1 and new_k2 should now be gone from the map. 
- assert!(m.get(&k1).is_none()); - assert!(m.get_alt(&new_k2).is_none()); - - assert_eq!(*m.get(&new_k1).expect("failed to look up main key"), val3); - assert_eq!(*m.get_alt(&k2).expect("failed to look up alt key"), val3); - } - - #[test] - fn remove() { - let mut m = MultikeyBTreeMap::::new(); - - let k1 = 0xc6c8_f5e0_b13e_ed40; - let k2 = 0x1a04_ce4b_8329_14fe; - let val = 0xf4e3_c360; - assert!(m.insert(k1, k2, val).is_none()); - - assert_eq!(m.remove(&k1).expect("failed to remove entry"), val); - assert!(m.get(&k1).is_none()); - assert!(m.get_alt(&k2).is_none()); - } -} diff --git a/blobfs/src/sync_io.rs b/blobfs/src/sync_io.rs index bb31c2434c5..4956fcd8a7e 100644 --- a/blobfs/src/sync_io.rs +++ b/blobfs/src/sync_io.rs @@ -5,123 +5,60 @@ //! Fuse passthrough file system, mirroring an existing FS hierarchy. +use super::*; #[cfg(feature = "virtiofs")] -use std::cmp::min; +use fuse_backend_rs::abi::virtio_fs; +use fuse_backend_rs::api::CreateIn; +#[cfg(feature = "virtiofs")] +use fuse_backend_rs::transport::FsCacheReqHandler; +use nydus_error::eacces; #[cfg(feature = "virtiofs")] -use std::convert::TryInto; -use std::ffi::{CStr, CString}; -use std::fs::File; +use std::cmp::min; +use std::ffi::CStr; use std::io; -use std::mem::{self, size_of, ManuallyDrop, MaybeUninit}; -use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; #[cfg(feature = "virtiofs")] use std::path::Path; -use std::sync::atomic::Ordering; -use std::sync::Arc; use std::time::Duration; - -use super::*; -use fuse_backend_rs::{api::CreateIn, bytes_to_cstr}; #[cfg(feature = "virtiofs")] use storage::device::BlobPrefetchRequest; -macro_rules! scoped_cred { - ($name:ident, $ty:ty, $syscall_nr:expr) => { - #[derive(Debug)] - struct $name; - - impl $name { - // Changes the effective uid/gid of the current thread to `val`. Changes - // the thread's credentials back to root when the returned struct is dropped. - fn new(val: $ty) -> io::Result> { - if val == 0 { - // Nothing to do since we are already uid 0. - return Ok(None); - } - - // We want credential changes to be per-thread because otherwise - // we might interfere with operations being carried out on other - // threads with different uids/gids. However, posix requires that - // all threads in a process share the same credentials. To do this - // libc uses signals to ensure that when one thread changes its - // credentials the other threads do the same thing. - // - // So instead we invoke the syscall directly in order to get around - // this limitation. Another option is to use the setfsuid and - // setfsgid systems calls. However since those calls have no way to - // return an error, it's preferable to do this instead. - - // This call is safe because it doesn't modify any memory and we - // check the return value. - let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) }; - if res == 0 { - Ok(Some($name)) - } else { - Err(io::Error::last_os_error()) - } - } - } - - impl Drop for $name { - fn drop(&mut self) { - let res = unsafe { libc::syscall($syscall_nr, -1, 0, -1) }; - if res < 0 { - error!( - "fuse: failed to change credentials back to root: {}", - io::Error::last_os_error(), - ); - } - } - } - }; -} -scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid); -scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid); - -fn set_creds( - uid: libc::uid_t, - gid: libc::gid_t, -) -> io::Result<(Option, Option)> { - // We have to change the gid before we change the uid because if we change the uid first then we - // lose the capability to change the gid. 
However changing back can happen in any order. - ScopedGid::new(gid).and_then(|gid| Ok((ScopedUid::new(uid)?, gid))) -} - impl BlobFs { #[cfg(feature = "virtiofs")] fn get_blob_id_and_size(&self, inode: Inode) -> io::Result<(String, i64)> { // locate blob file that the inode refers to - let data = self.inode_map.get(inode)?; - let pathname = - CString::new(format!("self/fd/{}", data.get_raw_fd())).map_err(|e| einval!(e))?; - - let blob_id_full_path = self.readlinkat_proc_file(&pathname)?; + let blob_id_full_path = self.pfs.readlinkat_proc_file(inode)?; let parent = blob_id_full_path .parent() .ok_or_else(|| einval!("blobfs: failed to find parent"))?; trace!( - "parent {:?} ------ {:?}", + "parent: {:?}, blob id path: {:?}", parent, - Path::new(self.bootstrap_args.blob_cache_dir.as_str()) + blob_id_full_path ); - if parent - != Path::new(self.cfg.root_dir.as_str()) - .join(self.bootstrap_args.blob_cache_dir.as_str()) - { - error!("blobfs: blob path is not valid."); - return Err(einval!("blobfs: blob path is not valid")); - } - let blob_id = blob_id_full_path - .file_name() - .ok_or_else(|| einval!("blobfs: failed to find blob file"))?; - trace!("load_chunks_on_demand: blob_id {:?}", blob_id); + debug_assert!( + parent + == Path::new(self.cfg.ps_config.root_dir.as_str()) + .join(self.bootstrap_args.blob_cache_dir.as_str()) + ); - let st = Self::stat(&data.file).map_err(|e| { + let blob_file = Self::open_file( + libc::AT_FDCWD, + &blob_id_full_path.as_path(), + libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, + 0, + ) + .map_err(|e| einval!(e))?; + let st = Self::stat(&blob_file).map_err(|e| { error!("get_blob_id_and_size: stat failed {:?}", e); e })?; + let blob_id = blob_id_full_path + .file_name() + .ok_or_else(|| einval!("blobfs: failed to find blob file"))?; + + trace!("load_chunks_on_demand: blob_id {:?}", blob_id); Ok((blob_id.to_os_string().into_string().unwrap(), st.st_size)) } @@ -150,403 +87,10 @@ impl BlobFs { len: min(len, 0x0020_0000_u32), // 2M range }; - self.bootstrap_args - .rafs - .fetch_range_synchronous(&[req]) - .map_err(|e| { - warn!("load chunks: error, {:?}", e); - e - }) - } - - // #[cfg(feature = "virtiofs")] - // fn load_chunks_on_demand_v1(&self, inode: Inode, foffset: u64) -> io::Result<()> { - // // locate blob file that the inode refers to - // let data = self.inode_map.get(inode)?; - // let pathname = - // CString::new(format!("self/fd/{}", data.get_raw_fd())).map_err(|e| einval!(e))?; - - // let blob_id_full_path = self.readlinkat_proc_file(&pathname)?; - // let parent = blob_id_full_path - // .parent() - // .ok_or_else(|| einval!("blobfs: failed to find parent"))?; - - // trace!( - // "parent {:?} ------ {:?}", - // parent, - // Path::new(self.cfg.root_dir.as_str()).join(self.bootstrap_args.blob_cache_dir.as_str()) - // ); - // if parent - // != Path::new(self.cfg.root_dir.as_str()) - // .join(self.bootstrap_args.blob_cache_dir.as_str()) - // { - // return Ok(()); - // } - // let blob_id = blob_id_full_path - // .file_name() - // .ok_or_else(|| einval!("blobfs: failed to find blob file"))?; - - // trace!("load_chunks_on_demand: blob_id {:?}", blob_id); - - // // make sure chunks are available. - // let blob_offset_map_ptr = match self.blob_offset_map.get(&blob_id.to_os_string()) { - // Ok(arg) => arg.base, - // Err(_) => { - // // blob_offset_map and bootstrap is in the same directory. 
- // let bootstrap_path = Path::new(self.cfg.root_dir.as_str()) - // .join(self.bootstrap_args.bootstrap.as_str()); - // let blob_offset_map_path = bootstrap_path - // .with_file_name(blob_id) - // .with_extension("blob_offset_map"); - - // trace!("blob_offset_map {:?}", blob_offset_map_path); - // let map_f = File::open(&blob_offset_map_path).map_err(|e| einval!(e))?; - // let size = map_f.metadata()?.len() as usize; - // trace!( - // "blobfs: blob offset map {:?} size {}", - // blob_offset_map_path, - // size - // ); - // if size == 0 { - // return Err(einval!("invalid blob offset map file size")); - // } - - // let base = unsafe { - // libc::mmap( - // std::ptr::null_mut(), - // size, - // libc::PROT_READ, - // libc::MAP_NORESERVE | libc::MAP_SHARED, - // map_f.as_raw_fd(), - // 0, - // ) - // } as *const u8; - // if base as *mut core::ffi::c_void == libc::MAP_FAILED { - // return Err(last_error!("failed to mmap blob offset map")); - // } - // if base.is_null() { - // return Err(ebadf!("failed to mmap blob offset map")); - // } - - // // save the ptr for later use. - // self.blob_offset_map.insert( - // blob_id.to_os_string(), - // Arc::new(BlobOffsetMapArg { base, size }), - // ); - // base - // } - // }; - - // let index = foffset >> 21; - // let mut ptr = blob_offset_map_ptr.wrapping_add((index * 16) as usize); - // let mut chunk_vec_off = unsafe { *(ptr as *const u64) }; - - // ptr = ptr.wrapping_add(std::mem::size_of::()); - // let mut chunk_vec_len = unsafe { *(ptr as *const u64) }; - - // loop { - // ptr = blob_offset_map_ptr.wrapping_add(chunk_vec_off as usize); - // // let chunk = unsafe { *(ptr as *const OndiskChunkInfo) }; - // let chunk_info_pos = unsafe { *(ptr as *const u64) }; - // // ptr = self - // // .bootstrap_args - // // .base - // // .wrapping_add(chunk_info_pos as usize); - // // let chunk = unsafe { *(ptr as *const OndiskChunkInfo) }; - // let chunk = self - // .bootstrap_args - // .rafs - // .sb - // .inodes - // .get_chunk_info(chunk_info_pos as usize) - // .map_err(|e| { - // error!( - // "load_chunks_on_demand: failed to get chunk info at {}", - // chunk_info_pos - // ); - // e - // })?; - - // let offset = 0; - // let end = RAFS_DEFAULT_BLOCK_SIZE; - // let mut desc = RafsBioDesc::new(); - // let blksize = RAFS_DEFAULT_BLOCK_SIZE; - - // let blob = self - // .bootstrap_args - // .rafs - // .sb - // .inodes - // .get_blob_table() - // .get(chunk.blob_index()) - // .map_err(|e| { - // error!( - // "load_chunks_on_demand: failed to get blob in blob table at index {}", - // chunk.blob_index() - // ); - // e - // })?; - - // let ret = - // add_chunk_to_bio_desc(offset, end, chunk.clone(), &mut desc, blksize as u32, blob); - // trace!("add chunk block id: {}", chunk.block_id()); - // if ret { - // let mut dummy_writer = DummyZcWriter {}; - // let ret = self - // .bootstrap_args - // .rafs - // .device - // .read_to(&mut dummy_writer, desc)?; - // trace!("read device: {}", ret); - // } else { - // return Err(eio!()); - // } - - // chunk_vec_off += std::mem::size_of::() as u64; - // chunk_vec_len -= 1; - // if chunk_vec_len == 0 { - // break; - // } - // } - // Ok(()) - // } - - #[cfg(feature = "virtiofs")] - fn readlinkat_proc_file(&self, pathname: &CStr) -> io::Result { - Self::readlinkat(self.proc.as_raw_fd(), pathname) - } - - fn open_proc_file(&self, pathname: &CStr, flags: i32) -> io::Result { - Self::open_file(self.proc.as_raw_fd(), pathname, flags, 0) - } - - fn open_inode(&self, inode: Inode, mut flags: i32) -> io::Result { - let data = 
self.inode_map.get(inode)?; - let pathname = CString::new(format!("self/fd/{}", data.get_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - - // When writeback caching is enabled, the kernel may send read requests even if the - // userspace program opened the file write-only. So we need to ensure that we have opened - // the file for reading as well as writing. - let writeback = self.writeback.load(Ordering::Relaxed); - if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY { - flags &= !libc::O_ACCMODE; - flags |= libc::O_RDWR; - } - - // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`. - // However, this breaks atomicity as the file may have changed on disk, invalidating the - // cached copy of the data in the kernel and the offset that the kernel thinks is the end of - // the file. Just allow this for now as it is the user's responsibility to enable writeback - // caching only for directories that are not shared. It also means that we need to clear the - // `O_APPEND` flag. - if writeback && flags & libc::O_APPEND != 0 { - flags &= !libc::O_APPEND; - } - - // We don't really check `flags` because if the kernel can't handle poorly specified flags - // then we have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since - // we need to follow the `/proc/self/fd` symlink to get the file. - self.open_proc_file(&pathname, (flags | libc::O_CLOEXEC) & (!libc::O_NOFOLLOW)) - } - - fn do_readdir( - &self, - inode: Inode, - handle: Handle, - size: u32, - offset: u64, - add_entry: &mut dyn FnMut(DirEntry) -> io::Result, - ) -> io::Result<()> { - if size == 0 { - return Ok(()); - } - - let mut buf = Vec::::with_capacity(size as usize); - let data = self.get_dirdata(handle, inode, libc::O_RDONLY)?; - - { - // Since we are going to work with the kernel offset, we have to acquire the file lock - // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread - // changes the kernel offset while we are using it. - let (guard, dir) = data.get_file_mut(); - - // Safe because this doesn't modify any memory and we check the return value. - let res = - unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - - // Safe because the kernel guarantees that it will only write to `buf` and we check the - // return value. - let res = unsafe { - libc::syscall( - libc::SYS_getdents64, - dir.as_raw_fd(), - buf.as_mut_ptr() as *mut LinuxDirent64, - size as libc::c_int, - ) - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - - // Safe because we trust the value returned by kernel. - unsafe { buf.set_len(res as usize) }; - - // Explicitly drop the lock so that it's not held while we fill in the fuse buffer. - mem::drop(guard); - } - - let mut rem = &buf[..]; - let orig_rem_len = rem.len(); - while !rem.is_empty() { - // We only use debug asserts here because these values are coming from the kernel and we - // trust them implicitly. 
- debug_assert!( - rem.len() >= size_of::(), - "fuse: not enough space left in `rem`" - ); - - let (front, back) = rem.split_at(size_of::()); - - let dirent64 = LinuxDirent64::from_slice(front) - .expect("fuse: unable to get LinuxDirent64 from slice"); - - let namelen = dirent64.d_reclen as usize - size_of::(); - debug_assert!( - namelen <= back.len(), - "fuse: back is smaller than `namelen`" - ); - - let name = &back[..namelen]; - let res = if name.starts_with(CURRENT_DIR_CSTR) || name.starts_with(PARENT_DIR_CSTR) { - // We don't want to report the "." and ".." entries. However, returning `Ok(0)` will - // break the loop so return `Ok` with a non-zero value instead. - Ok(1) - } else { - // The Sys_getdents64 in kernel will pad the name with '\0' - // bytes up to 8-byte alignment, so @name may contain a few null - // terminators. This causes an extra lookup from fuse when - // called by readdirplus, because kernel path walking only takes - // name without null terminators, the dentry with more than 1 - // null terminators added by readdirplus doesn't satisfy the - // path walking. - let name = bytes_to_cstr(name) - .map_err(|e| { - error!("fuse: do_readdir: {:?}", e); - io::Error::from_raw_os_error(libc::EINVAL) - })? - .to_bytes(); - - add_entry(DirEntry { - ino: dirent64.d_ino, - offset: dirent64.d_off as u64, - type_: u32::from(dirent64.d_ty), - name, - }) - }; - - debug_assert!( - rem.len() >= dirent64.d_reclen as usize, - "fuse: rem is smaller than `d_reclen`" - ); - - match res { - Ok(0) => break, - Ok(_) => rem = &rem[dirent64.d_reclen as usize..], - // If there's an error, we can only signal it if we haven't - // stored any entries yet - otherwise we'd end up with wrong - // lookup counts for the entries that are already in the - // buffer. So we return what we've collected until that point. - Err(e) if rem.len() == orig_rem_len => return Err(e), - Err(_) => return Ok(()), - } - } - - Ok(()) - } - - fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option, OpenOptions)> { - let file = self.open_inode(inode, flags as i32)?; - let data = HandleData::new(inode, file); - let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); - - self.handle_map.insert(handle, data); - - let mut opts = OpenOptions::empty(); - match self.cfg.cache_policy { - // We only set the direct I/O option on files. - CachePolicy::Never => opts.set( - OpenOptions::DIRECT_IO, - flags & (libc::O_DIRECTORY as u32) == 0, - ), - CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, - _ => {} - }; - - Ok((Some(handle), opts)) - } - - fn do_getattr(&self, inode: Inode) -> io::Result<(libc::stat64, Duration)> { - let data = self.inode_map.get(inode).map_err(|e| { - error!("fuse: do_getattr ino {} Not find err {:?}", inode, e); + self.bootstrap_args.fetch_range_sync(&[req]).map_err(|e| { + warn!("load chunks: error, {:?}", e); e - })?; - - let st = Self::stat(&data.file).map_err(|e| { - error!( - "fuse: do_getattr stat failed ino {} fd: {:?} err {:?}", - inode, - data.get_raw_fd(), - e - ); - e - })?; - - Ok((st, self.cfg.attr_timeout)) - } - - fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> { - let data = self.inode_map.get(parent)?; - // Safe because this doesn't modify any memory and we check the return value. 
- let res = unsafe { libc::unlinkat(data.get_raw_fd(), name.as_ptr(), flags) }; - if res == 0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) - } - } - - fn get_dirdata( - &self, - handle: Handle, - inode: Inode, - flags: libc::c_int, - ) -> io::Result> { - let no_open = self.no_opendir.load(Ordering::Relaxed); - if !no_open { - self.handle_map.get(handle, inode) - } else { - let file = self.open_inode(inode, (flags | libc::O_DIRECTORY) as i32)?; - Ok(Arc::new(HandleData::new(inode, file))) - } - } - - fn get_data( - &self, - handle: Handle, - inode: Inode, - flags: libc::c_int, - ) -> io::Result> { - let no_open = self.no_open.load(Ordering::Relaxed); - if !no_open { - self.handle_map.get(handle, inode) - } else { - let file = self.open_inode(inode, flags as i32)?; - Ok(Arc::new(HandleData::new(inode, file))) - } + }) } } @@ -555,74 +99,29 @@ impl FileSystem for BlobFs { type Handle = Handle; fn init(&self, capable: FsOptions) -> io::Result { - if self.cfg.do_import { - self.import()?; - } - - let mut opts = FsOptions::DO_READDIRPLUS | FsOptions::READDIRPLUS_AUTO; - // !cfg.do_import means we are under vfs, in which case capable is already - // negotiated and must be honored. - if (!self.cfg.do_import || self.cfg.writeback) - && capable.contains(FsOptions::WRITEBACK_CACHE) - { - opts |= FsOptions::WRITEBACK_CACHE; - self.writeback.store(true, Ordering::Relaxed); - } - if (!self.cfg.do_import || self.cfg.no_open) - && capable.contains(FsOptions::ZERO_MESSAGE_OPEN) - { - opts |= FsOptions::ZERO_MESSAGE_OPEN; - // We can't support FUSE_ATOMIC_O_TRUNC with no_open - opts.remove(FsOptions::ATOMIC_O_TRUNC); - self.no_open.store(true, Ordering::Relaxed); - } - if (!self.cfg.do_import || self.cfg.no_opendir) - && capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) - { - opts |= FsOptions::ZERO_MESSAGE_OPENDIR; - self.no_opendir.store(true, Ordering::Relaxed); - } - - Ok(opts) + #[cfg(feature = "virtiofs")] + let _ = self.bootstrap_args.get_rafs_handle()?; + self.pfs.init(capable) } fn destroy(&self) { - self.handle_map.clear(); - self.inode_map.clear(); - - if let Err(e) = self.import() { - error!("fuse: failed to destroy instance, {:?}", e); - }; + self.pfs.destroy() } fn statfs(&self, _ctx: &Context, inode: Inode) -> io::Result { - let data = self.inode_map.get(inode)?; - let mut out = MaybeUninit::::zeroed(); - - // Safe because this will only modify `out` and we check the return value. - match unsafe { libc::fstatvfs64(data.get_raw_fd(), out.as_mut_ptr()) } { - // Safe because the kernel guarantees that `out` has been initialized. 
- 0 => Ok(unsafe { out.assume_init() }), - _ => Err(io::Error::last_os_error()), - } + self.pfs.statfs(_ctx, inode) } fn lookup(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result { - self.do_lookup(parent, name) + self.pfs.lookup(_ctx, parent, name) } fn forget(&self, _ctx: &Context, inode: Inode, count: u64) { - let mut inodes = self.inode_map.get_map_mut(); - - Self::forget_one(&mut inodes, inode, count) + self.pfs.forget(_ctx, inode, count) } fn batch_forget(&self, _ctx: &Context, requests: Vec<(Inode, u64)>) { - let mut inodes = self.inode_map.get_map_mut(); - - for (inode, count) in requests { - Self::forget_one(&mut inodes, inode, count) - } + self.pfs.batch_forget(_ctx, requests) } fn opendir( @@ -631,12 +130,7 @@ impl FileSystem for BlobFs { inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if self.no_opendir.load(Ordering::Relaxed) { - info!("fuse: opendir is not supported."); - Err(io::Error::from_raw_os_error(libc::ENOSYS)) - } else { - self.do_open(inode, flags | (libc::O_DIRECTORY as u32)) - } + self.pfs.opendir(_ctx, inode, flags) } fn releasedir( @@ -646,31 +140,26 @@ impl FileSystem for BlobFs { _flags: u32, handle: Handle, ) -> io::Result<()> { - self.do_release(inode, handle) + self.pfs.releasedir(_ctx, inode, _flags, handle) } + #[allow(unused)] fn mkdir( &self, - ctx: &Context, - parent: Inode, - name: &CStr, - mode: u32, - umask: u32, + _ctx: &Context, + _parent: Inode, + _name: &CStr, + _mode: u32, + _umask: u32, ) -> io::Result { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - let data = self.inode_map.get(parent)?; - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { libc::mkdirat(data.get_raw_fd(), name.as_ptr(), mode & !umask) }; - if res == 0 { - self.do_lookup(parent, name) - } else { - Err(io::Error::last_os_error()) - } + error!("do mkdir req error: blob file can not be written."); + Err(eacces!("Mkdir request is not allowed in blobfs")) } - fn rmdir(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { - self.do_unlink(parent, name, libc::AT_REMOVEDIR) + #[allow(unused)] + fn rmdir(&self, _ctx: &Context, _parent: Inode, _name: &CStr) -> io::Result<()> { + error!("do rmdir req error: blob file can not be written."); + Err(eacces!("Rmdir request is not allowed in blobfs")) } fn readdir( @@ -682,7 +171,8 @@ impl FileSystem for BlobFs { offset: u64, add_entry: &mut dyn FnMut(DirEntry) -> io::Result, ) -> io::Result<()> { - self.do_readdir(inode, handle, size, offset, add_entry) + self.pfs + .readdir(_ctx, inode, handle, size, offset, add_entry) } fn readdirplus( @@ -694,28 +184,8 @@ impl FileSystem for BlobFs { offset: u64, add_entry: &mut dyn FnMut(DirEntry, Entry) -> io::Result, ) -> io::Result<()> { - self.do_readdir(inode, handle, size, offset, &mut |dir_entry| { - // Safe because do_readdir() has ensured dir_entry.name is a - // valid [u8] generated by CStr::to_bytes(). - let name = unsafe { - CStr::from_bytes_with_nul_unchecked(std::slice::from_raw_parts( - &dir_entry.name[0], - dir_entry.name.len() + 1, - )) - }; - let entry = self.do_lookup(inode, name)?; - let ino = entry.inode; - - add_entry(dir_entry, entry).map(|r| { - // true when size is not large enough to hold entry. - if r == 0 { - // Release the refcount acquired by self.do_lookup(). 
- let mut inodes = self.inode_map.get_map_mut(); - Self::forget_one(&mut inodes, ino, 1); - } - r - }) - }) + self.pfs + .readdirplus(_ctx, inode, handle, size, offset, add_entry) } fn open( @@ -725,12 +195,7 @@ impl FileSystem for BlobFs { flags: u32, _fuse_flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if self.no_open.load(Ordering::Relaxed) { - info!("fuse: open is not supported."); - Err(io::Error::from_raw_os_error(libc::ENOSYS)) - } else { - self.do_open(inode, flags) - } + self.pfs.open(_ctx, inode, flags, _fuse_flags) } fn release( @@ -743,57 +208,33 @@ impl FileSystem for BlobFs { _flock_release: bool, _lock_owner: Option, ) -> io::Result<()> { - if self.no_open.load(Ordering::Relaxed) { - Err(io::Error::from_raw_os_error(libc::ENOSYS)) - } else { - self.do_release(inode, handle) - } - } - + self.pfs.release( + _ctx, + inode, + _flags, + handle, + _flush, + _flock_release, + _lock_owner, + ) + } + + #[allow(unused)] fn create( &self, - ctx: &Context, - parent: Inode, - name: &CStr, - args: CreateIn, + _ctx: &Context, + _parent: Inode, + _name: &CStr, + _args: CreateIn, ) -> io::Result<(Entry, Option, OpenOptions)> { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - let data = self.inode_map.get(parent)?; - - // Safe because this doesn't modify any memory and we check the return value. We don't - // really check `flags` because if the kernel can't handle poorly specified flags then we - // have much bigger problems. - let file = Self::open_file( - data.get_raw_fd(), - name, - args.flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW, - args.mode & !(args.umask & 0o777), - )?; - - let entry = self.do_lookup(parent, name)?; - - let ret_handle = if !self.no_open.load(Ordering::Relaxed) { - let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); - let data = HandleData::new(entry.inode, file); - - self.handle_map.insert(handle, data); - Some(handle) - } else { - None - }; - - let mut opts = OpenOptions::empty(); - match self.cfg.cache_policy { - CachePolicy::Never => opts |= OpenOptions::DIRECT_IO, - CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, - _ => {} - }; - - Ok((entry, ret_handle, opts)) + error!("do create req error: blob file cannot write."); + Err(eacces!("Create request is not allowed in blobfs")) } - fn unlink(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { - self.do_unlink(parent, name, 0) + #[allow(unused)] + fn unlink(&self, _ctx: &Context, _parent: Inode, _name: &CStr) -> io::Result<()> { + error!("do unlink req error: blob file cannot write."); + Err(eacces!("Unlink request is not allowed in blobfs")) } #[cfg(feature = "virtiofs")] @@ -813,16 +254,12 @@ impl FileSystem for BlobFs { inode, foffset, len, flags, moffset ); + if (flags & virtio_fs::SetupmappingFlags::WRITE.bits()) != 0 { + return Err(eacces!("blob file cannot write in dax")); + } self.load_chunks_on_demand(inode, foffset)?; - - let open_flags = if (flags & virtio_fs::SetupmappingFlags::WRITE.bits()) != 0 { - libc::O_RDWR - } else { - libc::O_RDONLY - }; - - let file = self.open_inode(inode, open_flags as i32)?; - (*vu_req).map(foffset, moffset, len, flags, file.as_raw_fd()) + self.pfs + .setupmapping(_ctx, inode, _handle, foffset, len, flags, moffset, vu_req) } #[cfg(feature = "virtiofs")] @@ -833,53 +270,42 @@ impl FileSystem for BlobFs { requests: Vec, vu_req: &mut dyn FsCacheReqHandler, ) -> io::Result<()> { - (*vu_req).unmap(requests) + self.pfs.removemapping(_ctx, _inode, requests, vu_req) } fn read( &self, _ctx: &Context, - inode: Inode, 
- handle: Handle, - w: &mut dyn ZeroCopyWriter, - size: u32, - offset: u64, + _inode: Inode, + _handle: Handle, + _w: &mut dyn ZeroCopyWriter, + _size: u32, + _offset: u64, _lock_owner: Option, _flags: u32, ) -> io::Result { - let data = self.get_data(handle, inode, libc::O_RDONLY)?; - - // Manually implement File::try_clone() by borrowing fd of data.file instead of dup(). - // It's safe because the `data` variable's lifetime spans the whole function, - // so data.file won't be closed. - let f = unsafe { File::from_raw_fd(data.get_handle_raw_fd()) }; - let mut f = ManuallyDrop::new(f); - - w.write_from(&mut *f, size as usize, offset) + error!( + "do Read req error: blob file cannot do nondax read, please check if dax is enabled" + ); + Err(eacces!("Read request is not allowed in blobfs")) } + #[allow(unused)] fn write( &self, _ctx: &Context, - inode: Inode, - handle: Handle, - r: &mut dyn ZeroCopyReader, - size: u32, - offset: u64, + _inode: Inode, + _handle: Handle, + _r: &mut dyn ZeroCopyReader, + _size: u32, + _offset: u64, _lock_owner: Option, _delayed_write: bool, _flags: u32, _fuse_flags: u32, ) -> io::Result { - let data = self.get_data(handle, inode, libc::O_RDWR)?; - - // Manually implement File::try_clone() by borrowing fd of data.file instead of dup(). - // It's safe because the `data` variable's lifetime spans the whole function, - // so data.file won't be closed. - let f = unsafe { File::from_raw_fd(data.get_handle_raw_fd()) }; - let mut f = ManuallyDrop::new(f); - - r.read_to(&mut *f, size as usize, offset) + error!("do Write req error: blob file cannot write."); + Err(eacces!("Write request is not allowed in blobfs")) } fn getattr( @@ -888,276 +314,76 @@ impl FileSystem for BlobFs { inode: Inode, _handle: Option, ) -> io::Result<(libc::stat64, Duration)> { - self.do_getattr(inode) + self.pfs.getattr(_ctx, inode, _handle) } + #[allow(unused)] fn setattr( &self, _ctx: &Context, - inode: Inode, - attr: libc::stat64, - handle: Option, - valid: SetattrValid, + _inode: Inode, + _attr: libc::stat64, + _handle: Option, + _valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)> { - let inode_data = self.inode_map.get(inode)?; - - enum Data { - Handle(Arc, RawFd), - ProcPath(CString), - } - - let data = if self.no_open.load(Ordering::Relaxed) { - let pathname = CString::new(format!("self/fd/{}", inode_data.get_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Data::ProcPath(pathname) - } else { - // If we have a handle then use it otherwise get a new fd from the inode. - if let Some(handle) = handle { - let hd = self.handle_map.get(handle, inode)?; - let fd = hd.get_handle_raw_fd(); - Data::Handle(hd, fd) - } else { - let pathname = CString::new(format!("self/fd/{}", inode_data.get_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Data::ProcPath(pathname) - } - }; - - if valid.contains(SetattrValid::MODE) { - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - match data { - Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode), - Data::ProcPath(ref p) => { - libc::fchmodat(self.proc.as_raw_fd(), p.as_ptr(), attr.st_mode, 0) - } - } - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - } - - if valid.intersects(SetattrValid::UID | SetattrValid::GID) { - let uid = if valid.contains(SetattrValid::UID) { - attr.st_uid - } else { - // Cannot use -1 here because these are unsigned values. 
- ::std::u32::MAX - }; - let gid = if valid.contains(SetattrValid::GID) { - attr.st_gid - } else { - // Cannot use -1 here because these are unsigned values. - ::std::u32::MAX - }; - - // Safe because this is a constant value and a valid C string. - let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - libc::fchownat( - inode_data.get_raw_fd(), - empty.as_ptr(), - uid, - gid, - libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, - ) - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - } - - if valid.contains(SetattrValid::SIZE) { - // Safe because this doesn't modify any memory and we check the return value. - let res = match data { - Data::Handle(_, fd) => unsafe { libc::ftruncate(fd, attr.st_size) }, - _ => { - // There is no `ftruncateat` so we need to get a new fd and truncate it. - let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; - unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } - } - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - } - - if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) { - let mut tvs = [ - libc::timespec { - tv_sec: 0, - tv_nsec: libc::UTIME_OMIT, - }, - libc::timespec { - tv_sec: 0, - tv_nsec: libc::UTIME_OMIT, - }, - ]; - - if valid.contains(SetattrValid::ATIME_NOW) { - tvs[0].tv_nsec = libc::UTIME_NOW; - } else if valid.contains(SetattrValid::ATIME) { - tvs[0].tv_sec = attr.st_atime; - tvs[0].tv_nsec = attr.st_atime_nsec; - } - - if valid.contains(SetattrValid::MTIME_NOW) { - tvs[1].tv_nsec = libc::UTIME_NOW; - } else if valid.contains(SetattrValid::MTIME) { - tvs[1].tv_sec = attr.st_mtime; - tvs[1].tv_nsec = attr.st_mtime_nsec; - } - - // Safe because this doesn't modify any memory and we check the return value. - let res = match data { - Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) }, - Data::ProcPath(ref p) => unsafe { - libc::utimensat(self.proc.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0) - }, - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - } - - self.do_getattr(inode) + error!("do setattr req error: blob file cannot write."); + Err(eacces!("Setattr request is not allowed in blobfs")) } + #[allow(unused)] fn rename( &self, _ctx: &Context, - olddir: Inode, - oldname: &CStr, - newdir: Inode, - newname: &CStr, - flags: u32, + _olddir: Inode, + _oldname: &CStr, + _newdir: Inode, + _newname: &CStr, + _flags: u32, ) -> io::Result<()> { - let old_inode = self.inode_map.get(olddir)?; - let new_inode = self.inode_map.get(newdir)?; - - // Safe because this doesn't modify any memory and we check the return value. - // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands - // and we have glibc 2.28. 
- let res = unsafe { - libc::syscall( - libc::SYS_renameat2, - old_inode.get_raw_fd(), - oldname.as_ptr(), - new_inode.get_raw_fd(), - newname.as_ptr(), - flags, - ) - }; - if res == 0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) - } + error!("do rename req error: blob file cannot write."); + Err(eacces!("Rename request is not allowed in blobfs")) } + #[allow(unused)] fn mknod( &self, - ctx: &Context, - parent: Inode, - name: &CStr, - mode: u32, - rdev: u32, - umask: u32, + _ctx: &Context, + _parent: Inode, + _name: &CStr, + _mode: u32, + _rdev: u32, + _umask: u32, ) -> io::Result { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - let data = self.inode_map.get(parent)?; - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - libc::mknodat( - data.get_raw_fd(), - name.as_ptr(), - (mode & !umask) as libc::mode_t, - u64::from(rdev), - ) - }; - if res < 0 { - Err(io::Error::last_os_error()) - } else { - self.do_lookup(parent, name) - } + error!("do mknode req error: blob file cannot write."); + Err(eacces!("Mknod request is not allowed in blobfs")) } + #[allow(unused)] fn link( &self, _ctx: &Context, - inode: Inode, - newparent: Inode, - newname: &CStr, + _inode: Inode, + _newparent: Inode, + _newname: &CStr, ) -> io::Result { - let data = self.inode_map.get(inode)?; - let new_inode = self.inode_map.get(newparent)?; - - // Safe because this is a constant value and a valid C string. - let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - libc::linkat( - data.get_raw_fd(), - empty.as_ptr(), - new_inode.get_raw_fd(), - newname.as_ptr(), - libc::AT_EMPTY_PATH, - ) - }; - if res == 0 { - self.do_lookup(newparent, newname) - } else { - Err(io::Error::last_os_error()) - } + error!("do link req error: blob file cannot write."); + Err(eacces!("Link request is not allowed in blobfs")) } + #[allow(unused)] fn symlink( &self, - ctx: &Context, - linkname: &CStr, - parent: Inode, - name: &CStr, + _ctx: &Context, + _linkname: &CStr, + _parent: Inode, + _name: &CStr, ) -> io::Result { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - let data = self.inode_map.get(parent)?; - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { libc::symlinkat(linkname.as_ptr(), data.get_raw_fd(), name.as_ptr()) }; - if res == 0 { - self.do_lookup(parent, name) - } else { - Err(io::Error::last_os_error()) - } + error!("do symlink req error: blob file cannot write."); + Err(eacces!("Symlink request is not allowed in blobfs")) } fn readlink(&self, _ctx: &Context, inode: Inode) -> io::Result> { - // Safe because this is a constant value and a valid C string. - let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; - let mut buf = Vec::::with_capacity(libc::PATH_MAX as usize); - let data = self.inode_map.get(inode)?; - - // Safe because this will only modify the contents of `buf` and we check the return value. - let res = unsafe { - libc::readlinkat( - data.get_raw_fd(), - empty.as_ptr(), - buf.as_mut_ptr() as *mut libc::c_char, - libc::PATH_MAX as usize, - ) - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - - // Safe because we trust the value returned by kernel. 
- unsafe { buf.set_len(res as usize) }; - - Ok(buf) + self.pfs.readlink(_ctx, inode) } fn flush( @@ -1167,27 +393,7 @@ impl FileSystem for BlobFs { handle: Handle, _lock_owner: u64, ) -> io::Result<()> { - if self.no_open.load(Ordering::Relaxed) { - return Err(io::Error::from_raw_os_error(libc::ENOSYS)); - } - - let data = self.handle_map.get(handle, inode)?; - - // Since this method is called whenever an fd is closed in the client, we can emulate that - // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe - // because this doesn't modify any memory and we check the return values. - unsafe { - let newfd = libc::dup(data.get_handle_raw_fd()); - if newfd < 0 { - return Err(io::Error::last_os_error()); - } - - if libc::close(newfd) < 0 { - Err(io::Error::last_os_error()) - } else { - Ok(()) - } - } + self.pfs.flush(_ctx, inode, handle, _lock_owner) } fn fsync( @@ -1197,22 +403,7 @@ impl FileSystem for BlobFs { datasync: bool, handle: Handle, ) -> io::Result<()> { - let data = self.get_data(handle, inode, libc::O_RDONLY)?; - let fd = data.get_handle_raw_fd(); - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - if datasync { - libc::fdatasync(fd) - } else { - libc::fsync(fd) - } - }; - if res == 0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) - } + self.pfs.fsync(_ctx, inode, datasync, handle) } fn fsyncdir( @@ -1222,84 +413,24 @@ impl FileSystem for BlobFs { datasync: bool, handle: Handle, ) -> io::Result<()> { - self.fsync(ctx, inode, datasync, handle) + self.pfs.fsyncdir(ctx, inode, datasync, handle) } fn access(&self, ctx: &Context, inode: Inode, mask: u32) -> io::Result<()> { - let data = self.inode_map.get(inode)?; - let st = Self::stat(&data.file)?; - let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK); - - if mode == libc::F_OK { - // The file exists since we were able to call `stat(2)` on it. - return Ok(()); - } - - if (mode & libc::R_OK) != 0 - && ctx.uid != 0 - && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0) - && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0) - && st.st_mode & 0o004 == 0 - { - return Err(io::Error::from_raw_os_error(libc::EACCES)); - } - - if (mode & libc::W_OK) != 0 - && ctx.uid != 0 - && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0) - && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0) - && st.st_mode & 0o002 == 0 - { - return Err(io::Error::from_raw_os_error(libc::EACCES)); - } - - // root can only execute something if it is executable by one of the owner, the group, or - // everyone. - if (mode & libc::X_OK) != 0 - && (ctx.uid != 0 || st.st_mode & 0o111 == 0) - && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0) - && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0) - && st.st_mode & 0o001 == 0 - { - return Err(io::Error::from_raw_os_error(libc::EACCES)); - } - - Ok(()) + self.pfs.access(ctx, inode, mask) } + #[allow(unused)] fn setxattr( &self, _ctx: &Context, - inode: Inode, - name: &CStr, - value: &[u8], - flags: u32, + _inode: Inode, + _name: &CStr, + _value: &[u8], + _flags: u32, ) -> io::Result<()> { - if !self.cfg.xattr { - return Err(io::Error::from_raw_os_error(libc::ENOSYS)); - } - - let data = self.inode_map.get(inode)?; - let pathname = CString::new(format!("/proc/self/fd/{}", data.file.as_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - - // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we - // need to use the {set,get,remove,list}xattr variants. 
- // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - libc::setxattr( - pathname.as_ptr(), - name.as_ptr(), - value.as_ptr() as *const libc::c_void, - value.len(), - flags as libc::c_int, - ) - }; - if res == 0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) - } + error!("do setxattr req error: blob file cannot write."); + Err(eacces!("Setxattr request is not allowed in blobfs")) } fn getxattr( @@ -1309,121 +440,34 @@ impl FileSystem for BlobFs { name: &CStr, size: u32, ) -> io::Result { - if !self.cfg.xattr { - return Err(io::Error::from_raw_os_error(libc::ENOSYS)); - } - - let data = self.inode_map.get(inode)?; - let mut buf = Vec::::with_capacity(size as usize); - let pathname = CString::new(format!("/proc/self/fd/{}", data.file.as_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - - // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we - // need to use the {set,get,remove,list}xattr variants. - // Safe because this will only modify the contents of `buf`. - let res = unsafe { - libc::getxattr( - pathname.as_ptr(), - name.as_ptr(), - buf.as_mut_ptr() as *mut libc::c_void, - size as libc::size_t, - ) - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - - if size == 0 { - Ok(GetxattrReply::Count(res as u32)) - } else { - // Safe because we trust the value returned by kernel. - unsafe { buf.set_len(res as usize) }; - Ok(GetxattrReply::Value(buf)) - } + self.pfs.getxattr(_ctx, inode, name, size) } fn listxattr(&self, _ctx: &Context, inode: Inode, size: u32) -> io::Result { - if !self.cfg.xattr { - return Err(io::Error::from_raw_os_error(libc::ENOSYS)); - } - - let data = self.inode_map.get(inode)?; - let mut buf = Vec::::with_capacity(size as usize); - let pathname = CString::new(format!("/proc/self/fd/{}", data.file.as_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - - // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we - // need to use the {set,get,remove,list}xattr variants. - // Safe because this will only modify the contents of `buf`. - let res = unsafe { - libc::listxattr( - pathname.as_ptr(), - buf.as_mut_ptr() as *mut libc::c_char, - size as libc::size_t, - ) - }; - if res < 0 { - return Err(io::Error::last_os_error()); - } - - if size == 0 { - Ok(ListxattrReply::Count(res as u32)) - } else { - // Safe because we trust the value returned by kernel. - unsafe { buf.set_len(res as usize) }; - Ok(ListxattrReply::Names(buf)) - } + self.pfs.listxattr(_ctx, inode, size) } - fn removexattr(&self, _ctx: &Context, inode: Inode, name: &CStr) -> io::Result<()> { - if !self.cfg.xattr { - return Err(io::Error::from_raw_os_error(libc::ENOSYS)); - } - - let data = self.inode_map.get(inode)?; - let pathname = CString::new(format!("/proc/self/fd/{}", data.file.as_raw_fd())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - - // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we - // need to use the {set,get,remove,list}xattr variants. - // Safe because this doesn't modify any memory and we check the return value. 
- let res = unsafe { libc::removexattr(pathname.as_ptr(), name.as_ptr()) }; - if res == 0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) - } + #[allow(unused)] + fn removexattr(&self, _ctx: &Context, _inode: Inode, _name: &CStr) -> io::Result<()> { + error!("do removexattr req error: blob file cannot write."); + Err(eacces!("Removexattr request is not allowed in blobfs")) } + #[allow(unused)] fn fallocate( &self, _ctx: &Context, - inode: Inode, - handle: Handle, - mode: u32, - offset: u64, - length: u64, + _inode: Inode, + _handle: Handle, + _mode: u32, + _offset: u64, + _length: u64, ) -> io::Result<()> { - // Let the Arc in scope, otherwise fd may get invalid. - let data = self.get_data(handle, inode, libc::O_RDWR)?; - let fd = data.get_handle_raw_fd(); - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - libc::fallocate64( - fd, - mode as libc::c_int, - offset as libc::off64_t, - length as libc::off64_t, - ) - }; - if res == 0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) - } + error!("do fallocate req error: blob file cannot write."); + Err(eacces!("Fallocate request is not allowed in blobfs")) } + #[allow(unused)] fn lseek( &self, _ctx: &Context, @@ -1432,24 +476,6 @@ impl FileSystem for BlobFs { offset: u64, whence: u32, ) -> io::Result { - // Let the Arc in scope, otherwise fd may get invalid. - let data = self.handle_map.get(handle, inode)?; - - // Acquire the lock to get exclusive access, otherwise it may break do_readdir(). - let (_guard, file) = data.get_file_mut(); - - // Safe because this doesn't modify any memory and we check the return value. - let res = unsafe { - libc::lseek( - file.as_raw_fd(), - offset as libc::off64_t, - whence as libc::c_int, - ) - }; - if res < 0 { - Err(io::Error::last_os_error()) - } else { - Ok(res as u64) - } + self.pfs.lseek(_ctx, inode, handle, offset, whence) } } From 4594de8de057bd0244e6930583e2d912e9f2233e Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 18 Jan 2022 23:52:26 +0800 Subject: [PATCH 8/9] storage: add alt_dirs config to localfs backend So that if blobfile doesn't exist in the primary 'dir', try the 'alt_dirs' again. Signed-off-by: Eryu Guan --- storage/src/backend/localfs.rs | 51 +++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/storage/src/backend/localfs.rs b/storage/src/backend/localfs.rs index a2bcc7bf7ec..ff4b3aec371 100644 --- a/storage/src/backend/localfs.rs +++ b/storage/src/backend/localfs.rs @@ -66,6 +66,8 @@ struct LocalFsConfig { blob_file: String, #[serde(default)] dir: String, + #[serde(default)] + alt_dirs: Vec, } struct LocalFsEntry { @@ -210,6 +212,8 @@ pub struct LocalFs { // Directory to store blob files. If `blob_file` is not specified, `dir`/`blob_id` will be used // as the blob file name. 
dir: String, + // Alternative directories to store blob files + alt_dirs: Vec, // Whether to prefetch data from the blob file readahead: bool, // Number of seconds to collect blob access logs @@ -232,6 +236,7 @@ impl LocalFs { Ok(LocalFs { blob_file: config.blob_file, dir: config.dir, + alt_dirs: config.alt_dirs, readahead: config.readahead, readahead_sec: config.readahead_sec, metrics: BackendMetrics::new(id, "localfs"), @@ -245,7 +250,29 @@ impl LocalFs { let path = if !self.blob_file.is_empty() { Path::new(&self.blob_file).to_path_buf() } else { - Path::new(&self.dir).join(blob_id) + // Search blob file in dir and additionally in alt_dirs + let is_valid = |dir: &PathBuf| -> bool { + let blob = Path::new(&dir).join(blob_id); + if let Ok(meta) = std::fs::metadata(&blob) { + meta.len() != 0 + } else { + false + } + }; + + let blob = Path::new(&self.dir).join(blob_id); + if is_valid(&blob) || self.alt_dirs.is_empty() { + blob + } else { + let mut file = PathBuf::new(); + for dir in &self.alt_dirs { + file = Path::new(dir).join(blob_id); + if is_valid(&file) { + break; + } + } + file + } }; path.canonicalize().map_err(LocalFsError::BlobFile) @@ -480,6 +507,7 @@ mod tests { readahead_sec: 20, blob_file: "".to_string(), dir: "".to_string(), + alt_dirs: Vec::new(), }; let json = serde_json::to_value(&config).unwrap(); assert!(LocalFs::new(json, Some("test")).is_err()); @@ -489,6 +517,7 @@ mod tests { readahead_sec: 20, blob_file: "/a/b/c".to_string(), dir: "/a/b".to_string(), + alt_dirs: Vec::new(), }; let json = serde_json::to_value(&config).unwrap(); assert!(LocalFs::new(json, None).is_err()); @@ -501,6 +530,7 @@ mod tests { readahead_sec: 20, blob_file: "/a/b/cxxxxxxxxxxxxxxxxxxxxxxx".to_string(), dir: "/a/b".to_string(), + alt_dirs: Vec::new(), }; let json = serde_json::to_value(&config).unwrap(); let fs = LocalFs::new(json, Some("test")).unwrap(); @@ -515,6 +545,7 @@ mod tests { readahead_sec: 20, blob_file: path.to_str().unwrap().to_owned(), dir: path.parent().unwrap().to_str().unwrap().to_owned(), + alt_dirs: Vec::new(), }; let json = serde_json::to_value(&config).unwrap(); let fs = LocalFs::new(json, Some("test")).unwrap(); @@ -525,6 +556,21 @@ mod tests { readahead_sec: 20, blob_file: "".to_string(), dir: path.parent().unwrap().to_str().unwrap().to_owned(), + alt_dirs: Vec::new(), + }; + let json = serde_json::to_value(&config).unwrap(); + let fs = LocalFs::new(json, Some(filename)).unwrap(); + assert_eq!(fs.get_blob_path(filename).unwrap().to_str(), path.to_str()); + + let config = LocalFsConfig { + readahead: true, + readahead_sec: 20, + blob_file: "".to_string(), + dir: "/a/b".to_string(), + alt_dirs: vec![ + "/test".to_string(), + path.parent().unwrap().to_str().unwrap().to_owned(), + ], }; let json = serde_json::to_value(&config).unwrap(); let fs = LocalFs::new(json, Some(filename)).unwrap(); @@ -541,6 +587,7 @@ mod tests { readahead_sec: 20, blob_file: "".to_string(), dir: path.parent().unwrap().to_str().unwrap().to_owned(), + alt_dirs: Vec::new(), }; let json = serde_json::to_value(&config).unwrap(); let fs = LocalFs::new(json, Some(filename)).unwrap(); @@ -567,6 +614,7 @@ mod tests { readahead_sec: 20, blob_file: "".to_string(), dir: path.parent().unwrap().to_str().unwrap().to_owned(), + alt_dirs: Vec::new(), }; let json = serde_json::to_value(&config).unwrap(); let fs = LocalFs::new(json, Some(filename)).unwrap(); @@ -616,6 +664,7 @@ mod tests { readahead_sec: 10, blob_file: "".to_string(), dir: path.parent().unwrap().to_str().unwrap().to_owned(), + alt_dirs: Vec::new(), }; 
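        // As an aside, a localfs backend config exercising `alt_dirs` might look
        // like the JSON below (paths are hypothetical, for illustration only).
        // A non-empty `blob_file` bypasses the search entirely; otherwise `dir`
        // is probed first and each entry of `alt_dirs` is tried in order, the
        // first candidate holding a non-empty `<candidate>/<blob_id>` file wins:
        //
        //   {
        //       "readahead": true,
        //       "readahead_sec": 20,
        //       "blob_file": "",
        //       "dir": "/var/lib/nydus/blobs",
        //       "alt_dirs": ["/mnt/nfs/blobs", "/var/cache/nydus/blobs"]
        //   }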
let json = serde_json::to_value(&config).unwrap(); let fs = LocalFs::new(json, Some(filename)).unwrap(); From 2d5ee2f30c044518a4de12cbb23ee3a4d264044c Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 9 Mar 2022 17:35:07 +0800 Subject: [PATCH 9/9] cargo: pin regex crate version To fix RUSTSEC-2022-0013 Advisory: https://rustsec.org/advisories/RUSTSEC-2022-0013 Signed-off-by: Peng Tao --- Cargo.lock | 36 ++++++++++++++---------------------- Cargo.toml | 2 ++ app/Cargo.toml | 2 ++ 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3dc716cfbbb..2a6dd7d73da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,9 +17,9 @@ checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] @@ -375,9 +375,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e" +checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "atty", "humantime", @@ -883,9 +883,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.102" +version = "0.2.119" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2a5ac8f984bfcf3a823267e5fde638acc3325f6496633a5da6bb6eb2171e103" +checksum = "1bf2e165bb3457c8e098ea76f3e3bc9db55f87aa90d52d0e6be741470916aaa4" [[package]] name = "libgit2-sys" @@ -956,9 +956,9 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "memchr" -version = "2.3.3" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" [[package]] name = "memoffset" @@ -1151,6 +1151,7 @@ dependencies = [ "log", "nix 0.23.1", "nydus-error", + "regex", "serde", ] @@ -1193,6 +1194,7 @@ dependencies = [ "openssl-src", "rafs", "rand_core", + "regex", "rlimit", "rust-fsm", "sendfd", @@ -1516,21 +1518,20 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.3" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" +checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", ] [[package]] name = "regex-syntax" -version = "0.6.22" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "remove_dir_all" @@ -1928,15 +1929,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread_local" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb9bc092d0d51e76b2b19d9d85534ffc9ec2db959a2523cdae0697e2972cd447" -dependencies = [ - "lazy_static", -] - [[package]] name = "time" version 
= "0.1.43" diff --git a/Cargo.toml b/Cargo.toml index e09a7c97f01..698ef5690c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,8 @@ libc = "0.2" vmm-sys-util = ">=0.9.0" clap = "2.33" flexi_logger = { version = "0.17" } +# pin regex to fix RUSTSEC-2022-0013 +regex = ">=1.5.5" serde = { version = ">=1.0.27", features = ["serde_derive", "rc"] } serde_json = "1.0.51" serde_with = { version = "1.6.0", features = ["macros"] } diff --git a/app/Cargo.toml b/app/Cargo.toml index 5d5951c86cd..8f5b7e9fca2 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -13,6 +13,8 @@ build = "build.rs" built = { version = "=0.4.3", features = ["chrono", "git2"] } [dependencies] +# pin regex to fix RUSTSEC-2022-0013 +regex = ">=1.5.5" flexi_logger = { version = "0.17" } libc = "0.2" log = "0.4"