From 22ab3c32029964990890a5ff7eb8b32d5e760896 Mon Sep 17 00:00:00 2001 From: Bert Belder Date: Tue, 29 Mar 2022 14:07:39 +0200 Subject: [PATCH 1/2] Use serde instead of hand-rolled serialization Co-authored-by: Pavel Grigorenko --- Cargo.lock | 51 ++++++- Cargo.toml | 23 ++-- src/db.rs | 376 ++++++++++++++------------------------------------- src/graph.rs | 3 +- 4 files changed, 170 insertions(+), 283 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9e79a6f..87eb076 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -27,7 +27,7 @@ dependencies = [ "argh_shared", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -90,6 +90,12 @@ dependencies = [ "instant", ] +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + [[package]] name = "hermit-abi" version = "0.3.1" @@ -163,6 +169,8 @@ dependencies = [ "jemallocator", "lazy_static", "libc", + "serde", + "serde_cbor", "tempfile", "winapi", ] @@ -208,6 +216,36 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "serde" +version = "1.0.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "syn" version = "1.0.109" @@ -219,6 +257,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "tempfile" version = "3.6.0" diff --git a/Cargo.toml b/Cargo.toml index 50b4e8d..6c74f63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,17 +13,24 @@ argh = "0.1.10" lazy_static = "1.4.0" libc = "0.2" +[dependencies.serde] +version = "1.0" +features = ["std", "derive"] + +[dependencies.serde_cbor] +version = "0.11" + [target.'cfg(windows)'.dependencies.winapi] version = "0.3.6" features = [ - "consoleapi", - "errhandlingapi", - "handleapi", - "processenv", - "processthreadsapi", - "std", - "synchapi", - "winbase", + "consoleapi", + "errhandlingapi", + "handleapi", + "processenv", + "processthreadsapi", + "std", + "synchapi", + "winbase", ] [target.'cfg(not(any(windows, target_arch = "wasm32")))'.dependencies] diff --git a/src/db.rs b/src/db.rs index 40f877d..39c37c4 100644 --- a/src/db.rs +++ b/src/db.rs @@ -6,17 +6,19 @@ use crate::{ graph::Hashes, }; use anyhow::{anyhow, bail}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::collections::HashSet; use std::fs::File; use std::io::BufReader; +use std::io::BufWriter; use std::io::Read; use std::io::Write; -use std::mem::MaybeUninit; -const VERSION: u32 = 1; +const VERSION: u32 = 2; /// Files are identified by integers that are stable across n2 executions. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct Id(u32); impl densemap::Index for Id { fn index(&self) -> usize { @@ -39,296 +41,114 @@ pub struct IdMap { db_ids: HashMap, } -/// Buffer that accumulates a single record's worth of writes. -/// Caller calls various .write_*() methods and then flush()es it to a Write. -/// We use this instead of a BufWrite because we want to write one full record -/// at a time if possible. -struct WriteBuf { - buf: [MaybeUninit; 16 << 10], - len: usize, -} - -impl WriteBuf { - fn new() -> Self { - WriteBuf { - buf: unsafe { MaybeUninit::uninit().assume_init() }, - len: 0, - } - } - - // Perf note: I tinkered with these writes in godbolt and using - // copy_from_slice generated better code than alternatives that did - // different kinds of indexing. - - fn write(&mut self, buf: &[u8]) { - // Safety: self.buf and buf are non-overlapping; bounds checks. - unsafe { - let ptr = self.buf.as_mut_ptr().add(self.len); - self.len += buf.len(); - if self.len > self.buf.len() { - panic!("oversized WriteBuf"); - } - std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr as *mut u8, buf.len()); - } - } - - fn write_u16(&mut self, n: u16) { - self.write(&n.to_le_bytes()); - } - - fn write_u24(&mut self, n: u32) { - self.write(&n.to_le_bytes()[..3]); - } - - fn write_u64(&mut self, n: u64) { - self.write(&n.to_le_bytes()); +fn read_signature(file: &mut impl Read) -> anyhow::Result<()> { + let mut buf: [u8; 4] = [0; 4]; + file.read_exact(&mut buf)?; + if buf.as_slice() != "n2db".as_bytes() { + bail!("invalid db signature"); } - - fn write_str(&mut self, s: &str) { - self.write_u16(s.len() as u16); - self.write(s.as_bytes()); - } - - fn write_id(&mut self, id: Id) { - if id.0 > (1 << 24) { - panic!("too many fileids"); - } - self.write_u24(id.0); + file.read_exact(&mut buf)?; + let version = u32::from_le_bytes(buf); + if version != VERSION { + bail!("db version mismatch: got {version}, expected {VERSION}; TODO: db upgrades etc"); } + Ok(()) +} - fn flush(self, w: &mut W) -> std::io::Result<()> { - // Safety: invariant is that self.buf up to self.len is initialized. - let buf: &[u8] = unsafe { std::mem::transmute(&self.buf[..self.len]) }; - w.write_all(buf)?; - Ok(()) - } +fn write_signature(file: &mut impl Write) -> std::io::Result<()> { + write!(file, "n2db")?; + file.write_all(&u32::to_le_bytes(VERSION)) } /// An opened database, ready for writes. pub struct Writer { ids: IdMap, - w: File, + w: BufWriter, } impl Writer { - fn create(path: &str) -> std::io::Result { - let f = std::fs::File::create(path)?; - let mut w = Writer { - ids: IdMap::default(), - w: f, - }; - w.write_signature()?; - Ok(w) - } - - fn from_opened(ids: IdMap, w: File) -> Self { - Writer { ids, w } - } - - fn write_signature(&mut self) -> std::io::Result<()> { - write!(&mut self.w, "n2db")?; - self.w.write_all(&u32::to_le_bytes(VERSION)) - } - - fn write_path(&mut self, name: &str) -> std::io::Result<()> { - if name.len() >= 0b1000_0000_0000_0000 { - panic!("filename too long"); + fn create(path: &str) -> anyhow::Result { + let file = File::create(path)?; + let mut w = BufWriter::new(file); + write_signature(&mut w)?; + Ok(Self { + ids: Default::default(), + w, + }) + } + + fn open(mut f: File, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Result { + let mut reader = BufReader::with_capacity(1usize << 16, &mut f); + read_signature(&mut reader)?; + let mut ids = IdMap::default(); + + for entry in serde_cbor::Deserializer::from_reader(&mut reader).into_iter() { + match entry? { + DbEntry::File(name) => { + let file_id = graph.file_id(name); + let db_id = ids.fileids.push(file_id); + ids.db_ids.insert(file_id, db_id); + } + DbEntry::Build { outs, deps, hash } => { + // Map each output to the associated build. + // In the common case, there is only one. + let builds = outs + .into_iter() + .filter_map(|id| graph.file(*ids.fileids.get(id)).input) + .collect::>(); + let deps = deps + .into_iter() + .map(|id| *ids.fileids.get(id)) + .collect::>(); + if builds.len() == 1 { + // Common case: only one associated build. + let bid = builds.into_iter().next().unwrap(); + graph.build_mut(bid).set_discovered_ins(deps); + hashes.set(bid, hash); + } else { + // The graph layout has changed since this build was recorded. + // The hashes won't line up anyway so it will be treated as dirty. + } + } + } } - let mut buf = WriteBuf::new(); - buf.write_str(name); - buf.flush(&mut self.w) + Ok(Self { + ids, + w: BufWriter::new(f), + }) } - fn ensure_id(&mut self, graph: &Graph, fileid: FileId) -> std::io::Result { + fn ensure_id(&mut self, graph: &Graph, fileid: FileId) -> anyhow::Result { let id = match self.ids.db_ids.get(&fileid) { Some(&id) => id, None => { let id = self.ids.fileids.push(fileid); self.ids.db_ids.insert(fileid, id); - self.write_path(&graph.file(fileid).name)?; + let entry = DbEntry::File(graph.file(fileid).name.to_owned()); + serde_cbor::ser::to_writer(&mut self.w, &entry)?; id } }; Ok(id) } - pub fn write_build(&mut self, graph: &Graph, id: BuildId, hash: Hash) -> std::io::Result<()> { + pub fn write_build(&mut self, graph: &Graph, id: BuildId, hash: Hash) -> anyhow::Result<()> { let build = graph.build(id); - let mut buf = WriteBuf::new(); - let outs = build.outs(); - let mark = (outs.len() as u16) | 0b1000_0000_0000_0000; - buf.write_u16(mark); - for &out in outs { - let id = self.ensure_id(graph, out)?; - buf.write_id(id); - } - - let deps = build.discovered_ins(); - buf.write_u16(deps.len() as u16); - for &dep in deps { - let id = self.ensure_id(graph, dep)?; - buf.write_id(id); - } - - buf.write_u64(hash.0); - - buf.flush(&mut self.w) - } -} - -struct Reader<'a> { - r: BufReader<&'a mut File>, - ids: IdMap, - graph: &'a mut Graph, - hashes: &'a mut Hashes, -} - -impl<'a> Reader<'a> { - fn read_u16(&mut self) -> std::io::Result { - let mut buf: [u8; 2] = [0; 2]; - self.r.read_exact(&mut buf[..])?; - Ok(u16::from_le_bytes(buf)) - } - - fn read_u24(&mut self) -> std::io::Result { - let mut buf: [u8; 4] = [0; 4]; - self.r.read_exact(&mut buf[..3])?; - Ok(u32::from_le_bytes(buf)) - } - - fn read_u64(&mut self) -> std::io::Result { - let mut buf: [u8; 8] = [0; 8]; - self.r.read_exact(&mut buf)?; - Ok(u64::from_le_bytes(buf)) - } - - fn read_id(&mut self) -> std::io::Result { - self.read_u24().map(Id) - } - - fn read_str(&mut self, len: usize) -> std::io::Result { - let mut buf = Vec::with_capacity(len); - // Safety: buf contents are uninitialized here, but we never read them - // before initialization. - // TODO: clippy says this is still UB, yuck. - unsafe { buf.set_len(len) }; - self.r.read_exact(buf.as_mut_slice())?; - Ok(unsafe { String::from_utf8_unchecked(buf) }) - } - - fn read_path(&mut self, len: usize) -> std::io::Result<()> { - let name = self.read_str(len)?; - // No canonicalization needed, paths were written canonicalized. - let fileid = self.graph.file_id(name); - let dbid = self.ids.fileids.push(fileid); - self.ids.db_ids.insert(fileid, dbid); + let outs = build + .outs() + .iter() + .map(|&file_id| self.ensure_id(graph, file_id)) + .collect::>>()?; + let deps = build + .discovered_ins() + .iter() + .map(|&file_id| self.ensure_id(graph, file_id)) + .collect::>>()?; + let entry = DbEntry::Build { outs, deps, hash }; + serde_cbor::ser::to_writer(&mut self.w, &entry)?; + self.w.flush()?; Ok(()) } - - fn read_build(&mut self, len: usize) -> std::io::Result<()> { - // This record logs a build. We expect all the outputs to be - // outputs of the same build id; if not, that means the graph has - // changed since this log, in which case we just ignore it. - // - // It's possible we log a build that generates files A B, then - // change the build file such that it only generates file A; this - // logic will still attach the old dependencies to A, but it - // shouldn't matter because the changed command line will cause us - // to rebuild A regardless, and these dependencies are only used - // to affect dirty checking, not build order. - - let mut unique_bid = None; - let mut obsolete = false; - for _ in 0..len { - let fileid = self.read_id()?; - if obsolete { - // Even though we know we don't want this record, we must - // keep reading to parse through it. - continue; - } - match self.graph.file(*self.ids.fileids.get(fileid)).input { - None => { - obsolete = true; - } - Some(bid) => { - match unique_bid { - None => unique_bid = Some(bid), - Some(unique_bid) if unique_bid == bid => { - // Ok, matches the existing id. - } - Some(_) => { - // Mismatch. - unique_bid = None; - obsolete = true; - } - } - } - } - } - - let len = self.read_u16()?; - let mut deps = Vec::new(); - for _ in 0..len { - let id = self.read_id()?; - deps.push(*self.ids.fileids.get(id)); - } - - let hash = Hash(self.read_u64()?); - - // unique_bid is set here if this record is valid. - if let Some(id) = unique_bid { - // Common case: only one associated build. - self.graph.build_mut(id).set_discovered_ins(deps); - self.hashes.set(id, hash); - } - Ok(()) - } - - fn read_signature(&mut self) -> anyhow::Result<()> { - let mut buf: [u8; 4] = [0; 4]; - self.r.read_exact(&mut buf[..])?; - if buf.as_slice() != "n2db".as_bytes() { - bail!("invalid db signature"); - } - self.r.read_exact(&mut buf[..])?; - let version = u32::from_le_bytes(buf); - if version != VERSION { - bail!("db version mismatch: got {version}, expected {VERSION}; TODO: db upgrades etc"); - } - Ok(()) - } - - fn read_file(&mut self) -> anyhow::Result<()> { - self.read_signature()?; - loop { - let mut len = match self.read_u16() { - Ok(r) => r, - Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => break, - Err(err) => bail!(err), - }; - let mask = 0b1000_0000_0000_0000; - if len & mask == 0 { - self.read_path(len as usize)?; - } else { - len &= !mask; - self.read_build(len as usize)?; - } - } - Ok(()) - } - - /// Reads an on-disk database, loading its state into the provided Graph/Hashes. - fn read(f: &mut File, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Result { - let mut r = Reader { - r: std::io::BufReader::new(f), - ids: IdMap::default(), - graph, - hashes, - }; - r.read_file()?; - - Ok(r.ids) - } } /// Opens or creates an on-disk database, loading its state into the provided Graph. @@ -338,14 +158,24 @@ pub fn open(path: &str, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Resul .append(true) .open(path) { - Ok(mut f) => { - let ids = Reader::read(&mut f, graph, hashes)?; - Ok(Writer::from_opened(ids, f)) - } - Err(err) if err.kind() == std::io::ErrorKind::NotFound => { - let w = Writer::create(path)?; - Ok(w) - } + Ok(f) => Writer::open(f, graph, hashes), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Writer::create(path), Err(err) => Err(anyhow!(err)), } } + +#[derive(Serialize, Deserialize)] +enum DbEntry { + #[serde(rename = "f")] + File(String), + + #[serde(rename = "b")] + Build { + #[serde(rename = "o")] + outs: Vec, + #[serde(rename = "d")] + deps: Vec, + #[serde(rename = "h")] + hash: Hash, + }, +} diff --git a/src/graph.rs b/src/graph.rs index 506023d..508712a 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -1,6 +1,7 @@ //! The build graph, a graph between files and commands. use crate::densemap::{self, DenseMap}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::hash::{self, Hasher}; use std::path::{Path, PathBuf}; @@ -8,7 +9,7 @@ use std::time::SystemTime; /// Hash value used to identify a given instance of a Build's execution; /// compared to verify whether a Build is up to date. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] pub struct Hash(pub u64); /// Id for File nodes in the Graph. From 63b1036df01098b1483797c3c9201fdb3cab6a6f Mon Sep 17 00:00:00 2001 From: Pavel Grigorenko Date: Wed, 14 Jun 2023 23:44:05 +0300 Subject: [PATCH 2/2] Switch from cbor to bincode --- Cargo.lock | 27 ++++++++++----------------- Cargo.toml | 4 ++-- src/db.rs | 22 +++++++++++++++++----- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87eb076..478fc34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,6 +42,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -90,12 +99,6 @@ dependencies = [ "instant", ] -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - [[package]] name = "hermit-abi" version = "0.3.1" @@ -166,11 +169,11 @@ version = "0.1.0" dependencies = [ "anyhow", "argh", + "bincode", "jemallocator", "lazy_static", "libc", "serde", - "serde_cbor", "tempfile", "winapi", ] @@ -225,16 +228,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half", - "serde", -] - [[package]] name = "serde_derive" version = "1.0.164" diff --git a/Cargo.toml b/Cargo.toml index 6c74f63..bea374b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,8 +17,8 @@ libc = "0.2" version = "1.0" features = ["std", "derive"] -[dependencies.serde_cbor] -version = "0.11" +[dependencies.bincode] +version = "1.3" [target.'cfg(windows)'.dependencies.winapi] version = "0.3.6" diff --git a/src/db.rs b/src/db.rs index 39c37c4..76f6f79 100644 --- a/src/db.rs +++ b/src/db.rs @@ -15,7 +15,7 @@ use std::io::BufWriter; use std::io::Read; use std::io::Write; -const VERSION: u32 = 2; +const VERSION: u32 = 3; /// Files are identified by integers that are stable across n2 executions. #[derive(Debug, Clone, Copy, Serialize, Deserialize)] @@ -77,13 +77,25 @@ impl Writer { }) } + fn read_entry(reader: &mut BufReader<&mut File>) -> bincode::Result> { + let result = bincode::deserialize_from(reader); + if let Err(boxed) = &result { + if let bincode::ErrorKind::Io(err) = boxed.as_ref() { + if err.kind() == std::io::ErrorKind::UnexpectedEof { + return Ok(None); + } + } + } + result.map(Some) + } + fn open(mut f: File, graph: &mut Graph, hashes: &mut Hashes) -> anyhow::Result { let mut reader = BufReader::with_capacity(1usize << 16, &mut f); read_signature(&mut reader)?; let mut ids = IdMap::default(); - for entry in serde_cbor::Deserializer::from_reader(&mut reader).into_iter() { - match entry? { + while let Some(entry) = Self::read_entry(&mut reader)? { + match entry { DbEntry::File(name) => { let file_id = graph.file_id(name); let db_id = ids.fileids.push(file_id); @@ -125,7 +137,7 @@ impl Writer { let id = self.ids.fileids.push(fileid); self.ids.db_ids.insert(fileid, id); let entry = DbEntry::File(graph.file(fileid).name.to_owned()); - serde_cbor::ser::to_writer(&mut self.w, &entry)?; + bincode::serialize_into(&mut self.w, &entry)?; id } }; @@ -145,7 +157,7 @@ impl Writer { .map(|&file_id| self.ensure_id(graph, file_id)) .collect::>>()?; let entry = DbEntry::Build { outs, deps, hash }; - serde_cbor::ser::to_writer(&mut self.w, &entry)?; + bincode::serialize_into(&mut self.w, &entry)?; self.w.flush()?; Ok(()) }