From 354d70d8a4acfc830b4931a90902d46abc193592 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 30 May 2023 12:04:21 +0100 Subject: [PATCH 1/3] docs: add link to nightly config doc for `SourceConfigMap` --- src/cargo/sources/config.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cargo/sources/config.rs b/src/cargo/sources/config.rs index 5d5a4e8dbdf..4097567bbf6 100644 --- a/src/cargo/sources/config.rs +++ b/src/cargo/sources/config.rs @@ -14,7 +14,9 @@ use log::debug; use std::collections::{HashMap, HashSet}; use url::Url; -/// Represents the entire `[source]` table in Cargo configuration. +/// Represents the entire [`[source]` replacement table][1] in Cargo configuration. +/// +/// [1]: https://doc.rust-lang.org/nightly/cargo/reference/config.html#source #[derive(Clone)] pub struct SourceConfigMap<'cfg> { /// Mapping of source name to the toml configuration. From 0b5ea836f405715bb31d38bdc95dd4f028991c4f Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Wed, 7 Jun 2023 13:49:22 +0100 Subject: [PATCH 2/3] doc: doc comments for registry module --- src/cargo/sources/registry/mod.rs | 299 +++++++++++++++++++----------- 1 file changed, 193 insertions(+), 106 deletions(-) diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs index 4143ac16371..373d9cd555e 100644 --- a/src/cargo/sources/registry/mod.rs +++ b/src/cargo/sources/registry/mod.rs @@ -2,13 +2,47 @@ //! //! # What's a Registry? //! -//! Registries are central locations where packages can be uploaded to, +//! [Registries] are central locations where packages can be uploaded to, //! discovered, and searched for. The purpose of a registry is to have a //! location that serves as permanent storage for versions of a crate over time. //! -//! Compared to git sources, a registry provides many packages as well as many -//! versions simultaneously. Git sources can also have commits deleted through -//! 
rebasings where registries cannot have their versions deleted. +//! Compared to git sources (see [`GitSource`]), a registry provides many +//! packages as well as many versions simultaneously. Git sources can also +//! have commits deleted through rebasings where registries cannot have their +//! versions deleted. +//! +//! In Cargo, [`RegistryData`] is an abstraction over each kind of actual +//! registry, and [`RegistrySource`] connects those implementations to +//! the [`Source`] trait. Two prominent features these abstractions provide are +//! +//! * A way to query the metadata of a package from a registry. The metadata +//! comes from the index. +//! * A way to download package contents (a.k.a. source files) that are required +//! when building the package itself. +//! +//! We'll cover each functionality later. +//! +//! [Registries]: https://doc.rust-lang.org/nightly/cargo/reference/registries.html +//! [`GitSource`]: super::GitSource +//! +//! # Different Kinds of Registries +//! +//! Cargo provides multiple kinds of registries. Each of them serves the index +//! and package contents in a slightly different way. Namely, +//! +//! * [`LocalRegistry`] --- Serves the index and package contents entirely on +//! a local filesystem. +//! * [`RemoteRegistry`] --- Serves the index ahead of time from a Git +//! repository, and package contents are downloaded as needed. +//! * [`HttpRegistry`] --- Serves both the index and package contents on demand +//! over an HTTP-based registry API. This is the default starting from 1.70.0. +//! +//! Each registry has its own [`RegistryData`] implementation, and can be +//! created from either [`RegistrySource::local`] or [`RegistrySource::remote`]. +//! +//! [`LocalRegistry`]: local::LocalRegistry +//! [`RemoteRegistry`]: remote::RemoteRegistry +//! [`HttpRegistry`]: http_remote::HttpRegistry //! //! # The Index of a Registry //! @@ -20,36 +54,16 @@ //! available on a registry, what versions are available, and what the //! 
dependencies for each version is. //! -//! One method of doing so would be having the registry expose an HTTP endpoint -//! which can be queried with a list of packages and a response of their -//! dependencies and versions is returned. This is somewhat inefficient however -//! as we may have to hit the endpoint many times and we may have already -//! queried for much of the data locally already (for other packages, for -//! example). This also involves inventing a transport format between the -//! registry and Cargo itself, so this route was not taken. -//! -//! Instead, Cargo communicates with registries through a git repository -//! referred to as the Index. The Index of a registry is essentially an easily -//! query-able version of the registry's database for a list of versions of a -//! package as well as a list of dependencies for each version. +//! To solve the problem, a registry must provide an index of package metadata. +//! The index of a registry is essentially an easily query-able version of the +//! registry's database for a list of versions of a package as well as a list +//! of dependencies for each version. The exact format of the index is +//! described later. //! -//! Using git to host this index provides a number of benefits: +//! See the [`index`] module for topics about the management, parsing, caching, +//! and versioning for the on-disk index. //! -//! * The entire index can be stored efficiently locally on disk. This means -//! that all queries of a registry can happen locally and don't need to touch -//! the network. -//! -//! * Updates of the index are quite efficient. Using git buys incremental -//! updates, compressed transmission, etc for free. The index must be updated -//! each time we need fresh information from a registry, but this is one -//! update of a git repository that probably hasn't changed a whole lot so -//! it shouldn't be too expensive. -//! -//! 
Additionally, each modification to the index is just appending a line at -//! the end of a file (the exact format is described later). This means that -//! the commits for an index are quite small and easily applied/compressible. -//! -//! ## The format of the Index +//! ## The Format of The Index //! //! The index is a store for the list of versions for all packages known, so its //! format on disk is optimized slightly to ensure that `ls registry` doesn't @@ -59,9 +73,12 @@ //! about the format of the registry: //! //! 1. Each crate will have one file corresponding to it. Each version for a -//! crate will just be a line in this file. +//! crate will just be a line in this file (see [`RegistryPackage`] for its +//! representation). //! 2. There will be two tiers of directories for crate names, under which //! crates corresponding to those tiers will be located. +//! (See [`cargo_util::registry::make_dep_path`] for the implementation of +//! this layout hierarchy.) //! //! As an example, this is an example hierarchy of an index: //! @@ -99,26 +116,30 @@ //! The purpose of this layout is to hopefully cut down on `ls` sizes as well as //! efficient lookup based on the crate name itself. //! -//! ## Crate files +//! See [The Cargo Book: Registry Index][registry-index] for the public +//! interface on the index format. +//! +//! [registry-index]: https://doc.rust-lang.org/nightly/cargo/reference/registry-index.html +//! +//! ## The Index Files //! //! Each file in the index is the history of one crate over time. Each line in //! the file corresponds to one version of a crate, stored in JSON format (see -//! the `RegistryPackage` structure below). +//! the [`RegistryPackage`] structure below). //! -//! As new versions are published, new lines are appended to this file. The only -//! modifications to this file that should happen over time are yanks of a -//! particular version. +//! As new versions are published, new lines are appended to this file. **The +//! 
only modifications to this file that should happen over time are yanks of a +//! particular version.** //! //! # Downloading Packages //! -//! The purpose of the Index was to provide an efficient method to resolve the -//! dependency graph for a package. So far we only required one network -//! interaction to update the registry's repository (yay!). After resolution has -//! been performed, however we need to download the contents of packages so we -//! can read the full manifest and build the source code. +//! The purpose of the index was to provide an efficient method to resolve the +//! dependency graph for a package. After resolution has been performed, we need +//! to download the contents of packages so we can read the full manifest and +//! build the source code. //! -//! To accomplish this, this source's `download` method will make an HTTP -//! request per-package requested to download tarballs into a local cache. These +//! To accomplish this, [`RegistryData::download`] will "make" an HTTP request +//! per-package requested to download tarballs into a local cache. These //! tarballs will then be unpacked into a destination folder. //! //! Note that because versions uploaded to the registry are frozen forever that @@ -128,7 +149,8 @@ //! //! # Filesystem Hierarchy //! -//! Overall, the `$HOME/.cargo` looks like this when talking about the registry: +//! Overall, the `$HOME/.cargo` looks like this when talking about the registry +//! (remote registries, specifically): //! //! ```notrust //! # A folder under which all registry metadata is hosted (similar to @@ -144,8 +166,8 @@ //! registry2-/ //! ... //! -//! # This folder is a cache for all downloaded tarballs from a registry. -//! # Once downloaded and verified, a tarball never changes. +//! # This folder is a cache for all downloaded tarballs (`.crate` file) +//! # from a registry. Once downloaded and verified, a tarball never changes. //! cache/ //! registry1-/-.crate //! ... @@ -153,6 +175,7 @@ //! 
# Location in which all tarballs are unpacked. Each tarball is known to //! # be frozen after downloading, so transitively this folder is also //! # frozen once its unpacked (it's never unpacked again) +//! # CAVEAT: They are not read-only. See rust-lang/cargo#9455. //! src/ //! registry1-/-/... //! ... @@ -186,26 +209,35 @@ use crate::util::{ restricted_names, CargoResult, Config, Filesystem, LimitErrorReader, OptVersionReq, }; +/// The `.cargo-ok` file is used to track if the source is already unpacked. +/// See [`RegistrySource::unpack_package`] for more. +/// +/// Not to be confused with `.cargo-ok` file in git sources. const PACKAGE_SOURCE_LOCK: &str = ".cargo-ok"; + pub const CRATES_IO_INDEX: &str = "https://github.com/rust-lang/crates.io-index"; pub const CRATES_IO_HTTP_INDEX: &str = "sparse+https://index.crates.io/"; pub const CRATES_IO_REGISTRY: &str = "crates-io"; pub const CRATES_IO_DOMAIN: &str = "crates.io"; + const CRATE_TEMPLATE: &str = "{crate}"; const VERSION_TEMPLATE: &str = "{version}"; const PREFIX_TEMPLATE: &str = "{prefix}"; const LOWER_PREFIX_TEMPLATE: &str = "{lowerprefix}"; const CHECKSUM_TEMPLATE: &str = "{sha256-checksum}"; + const MAX_UNPACK_SIZE: u64 = 512 * 1024 * 1024; const MAX_COMPRESSION_RATIO: usize = 20; // 20:1 -/// A "source" for a local (see `local::LocalRegistry`) or remote (see -/// `remote::RemoteRegistry`) registry. +/// A [`Source`] implementation for a local or a remote registry. /// -/// This contains common functionality that is shared between the two registry -/// kinds, with the registry-specific logic implemented as part of the +/// This contains common functionality that is shared between each registry +/// kind, with the registry-specific logic implemented as part of the /// [`RegistryData`] trait referenced via the `ops` field. +/// +/// For general concepts of registries, see the [module-level documentation](crate::sources::registry). pub struct RegistrySource<'cfg> { + /// The unique identifier of this source. 
source_id: SourceId, /// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`). src_path: Filesystem, @@ -225,7 +257,19 @@ pub struct RegistrySource<'cfg> { yanked_whitelist: HashSet, } -/// The `config.json` file stored in the index. +/// The [`config.json`] file stored in the index. +/// +/// The config file may look like: +/// +/// ```json +/// { +/// "dl": "https://example.com/api/{crate}/{version}/download", +/// "api": "https://example.com/api", +/// "auth-required": false # unstable feature (RFC 3139) +/// } +/// ``` +/// +/// [`config.json`]: https://doc.rust-lang.org/nightly/cargo/reference/registry-index.html#index-configuration #[derive(Deserialize, Debug, Clone)] #[serde(rename_all = "kebab-case")] pub struct RegistryConfig { @@ -252,22 +296,29 @@ pub struct RegistryConfig { /// If this is None, the registry does not support API commands. pub api: Option, - /// Whether all operations require authentication. + /// Whether all operations require authentication. See [RFC 3139]. + /// + /// [RFC 3139]: https://rust-lang.github.io/rfcs/3139-cargo-alternative-registry-auth.html #[serde(default)] pub auth_required: bool, } -/// The maximum version of the `v` field in the index this version of cargo -/// understands. +/// The maximum schema version of the `v` field in the index this version of +/// cargo understands. See [`RegistryPackage::v`] for the detail. pub(crate) const INDEX_V_MAX: u32 = 2; /// A single line in the index representing a single version of a package. #[derive(Deserialize)] pub struct RegistryPackage<'a> { + /// Name of the package. name: InternedString, + /// The version of this package. vers: Version, + /// All kinds of direct dependencies of the package, including dev and + /// build dependencies. #[serde(borrow)] deps: Vec>, + /// Set of features defined for the package, i.e., `[features]` table. features: BTreeMap>, /// This field contains features with new, extended syntax. 
Specifically, /// namespaced features (`dep:`) and weak dependencies (`pkg?/feat`). @@ -276,6 +327,7 @@ pub struct RegistryPackage<'a> { /// will fail to load due to not being able to parse the new syntax, even /// with a `Cargo.lock` file. features2: Option>>, + /// Checksum for verifying the integrity of the corresponding downloaded package. cksum: String, /// If `true`, Cargo will skip this version when resolving. /// @@ -349,19 +401,33 @@ fn escaped_char_in_json() { .unwrap(); } -/// A dependency as encoded in the index JSON. +/// A dependency as encoded in the [`RegistryPackage`] index JSON. #[derive(Deserialize)] struct RegistryDependency<'a> { + /// Name of the dependency. If the dependency is renamed, the original + /// would be stored in [`RegistryDependency::package`]. name: InternedString, + /// The SemVer requirement for this dependency. #[serde(borrow)] req: Cow<'a, str>, + /// Set of features enabled for this dependency. features: Vec, + /// Whether or not this is an optional dependency. optional: bool, + /// Whether or not default features are enabled. default_features: bool, + /// The target platform for this dependency. target: Option>, + /// The dependency kind: "dev", "build", or "normal". kind: Option>, + /// The URL of the index of the registry where this dependency is from. + /// `None` if it is from the same index. registry: Option>, + /// The original name if the dependency is renamed. package: Option, + /// Whether or not this is a public dependency. Unstable. See [RFC 1977]. + /// + /// [RFC 1977]: https://rust-lang.github.io/rfcs/1977-public-private-dependencies.html public: Option, } @@ -437,6 +503,7 @@ pub enum LoadResponse { /// The cache is out of date. Returned data should be used. Data { raw_data: Vec, + /// Version of this data to determine whether it is out of date. 
index_version: Option, }, @@ -444,10 +511,11 @@ pub enum LoadResponse { NotFound, } -/// An abstract interface to handle both a local (see `local::LocalRegistry`) -/// and remote (see `remote::RemoteRegistry`) registry. +/// An abstract interface to handle both a local and a remote registry. /// -/// This allows [`RegistrySource`] to abstractly handle both registry kinds. +/// This allows [`RegistrySource`] to abstractly handle each registry kind. +/// +/// For general concepts of registries, see the [module-level documentation](crate::sources::registry). pub trait RegistryData { /// Performs initialization for the registry. /// @@ -458,14 +526,15 @@ pub trait RegistryData { /// Returns the path to the index. /// /// Note that different registries store the index in different formats - /// (remote=git, local=files). + /// (remote = git, http & local = files). fn index_path(&self) -> &Filesystem; /// Loads the JSON for a specific named package from the index. /// /// * `root` is the root path to the index. /// * `path` is the relative path to the package to load (like `ca/rg/cargo`). - /// * `index_version` is the version of the requested crate data currently in cache. + /// * `index_version` is the version of the requested crate data currently + /// in cache. This is useful for checking if a local cache is outdated. fn load( &mut self, root: &Path, @@ -556,6 +625,8 @@ mod index; mod local; mod remote; +/// Generates a unique name for [`SourceId`] to have a unique path to put its +/// index files. fn short_name(id: SourceId, is_shallow: bool) -> String { let hash = hex::short_hash(&id); let ident = id.url().host_str().unwrap_or("").to_string(); @@ -567,6 +638,11 @@ fn short_name(id: SourceId, is_shallow: bool) -> String { } impl<'cfg> RegistrySource<'cfg> { + /// Creates a [`Source`] of a "remote" registry. + /// It could be either an HTTP-based [`http_remote::HttpRegistry`] or + /// a Git-based [`remote::RemoteRegistry`]. 
+ /// + /// * `yanked_whitelist` --- Packages allowed to be used, even if they are yanked. pub fn remote( source_id: SourceId, yanked_whitelist: &HashSet, @@ -596,6 +672,10 @@ impl<'cfg> RegistrySource<'cfg> { )) } + /// Creates a [`Source`] of a local registry, with [`local::LocalRegistry`] under the hood. + /// + /// * `path` --- The root path of a local registry on the file system. + /// * `yanked_whitelist` --- Packages allowed to be used, even if they are yanked. pub fn local( source_id: SourceId, path: &Path, @@ -607,6 +687,10 @@ impl<'cfg> RegistrySource<'cfg> { RegistrySource::new(source_id, config, &name, Box::new(ops), yanked_whitelist) } + /// Creates a source of a registry. This is an inner helper function. + /// + /// * `name` --- Unique name for this source, under which its source files (`.crate` tarballs) are stored. + /// * `ops` --- The underlying [`RegistryData`] type. fn new( source_id: SourceId, config: &'cfg Config, @@ -624,7 +708,7 @@ impl<'cfg> RegistrySource<'cfg> { } } - /// Decode the configuration stored within the registry. + /// Decode the [configuration](RegistryConfig) stored within the registry. /// /// This requires that the index has been at least checked out. pub fn config(&mut self) -> Poll>> { @@ -635,9 +719,46 @@ impl<'cfg> RegistrySource<'cfg> { /// compiled. /// /// No action is taken if the source looks like it's already unpacked. + /// + /// # History of interruption detection with `.cargo-ok` file + /// + /// Cargo has always included a `.cargo-ok` file ([`PACKAGE_SOURCE_LOCK`]) + /// to detect if extraction was interrupted, but it was originally empty. + /// + /// In 1.34, Cargo was changed to create the `.cargo-ok` file before it + /// started extraction to implement fine-grained locking. After it was + /// finished extracting, it wrote two bytes to indicate it was complete. + /// It would use the length check to detect if it was possibly interrupted. 
+ /// + /// In 1.36, Cargo changed to not use fine-grained locking, and instead used + /// a global lock. The use of `.cargo-ok` was no longer needed for locking + /// purposes, but was kept to detect when extraction was interrupted. + /// + /// In 1.49, Cargo changed to not create the `.cargo-ok` file before it + /// started extraction to deal with `.crate` files that inexplicably had + /// a `.cargo-ok` file in them. + /// + /// In 1.64, Cargo changed to detect `.crate` files with `.cargo-ok` files + /// in them in response to [CVE-2022-36113], which dealt with malicious + /// `.crate` files making `.cargo-ok` a symlink causing cargo to write "ok" + /// to any arbitrary file on the filesystem it has permission to. + /// + /// This is all a long-winded way of explaining the circumstances that might + /// cause a directory to contain a `.cargo-ok` file that is empty or + /// otherwise corrupted. Either this was extracted by a version of Rust + /// before 1.34, in which case everything should be fine. However, an empty + /// file created by versions 1.36 to 1.49 indicates that the extraction was + /// interrupted and that we need to start again. + /// + /// Another possibility is that the filesystem is simply corrupted, in + /// which case deleting the directory might be the safe thing to do. That + /// is probably unlikely, though. + /// + /// To be safe, we delete the directory and start over again if an empty + /// `.cargo-ok` file is found. + /// + /// [CVE-2022-36113]: https://blog.rust-lang.org/2022/09/14/cargo-cves.html#arbitrary-file-corruption-cve-2022-36113 fn unpack_package(&self, pkg: PackageId, tarball: &File) -> CargoResult { - // The `.cargo-ok` file is used to track if the source is already - // unpacked. 
let package_dir = format!("{}-{}", pkg.name(), pkg.version()); let dst = self.src_path.join(&package_dir); let path = dst.join(PACKAGE_SOURCE_LOCK); @@ -646,47 +767,7 @@ impl<'cfg> RegistrySource<'cfg> { match path.metadata() { Ok(meta) if meta.len() > 0 => return Ok(unpack_dir.to_path_buf()), Ok(_meta) => { - // The `.cargo-ok` file is not in a state we expect it to be - // (with two bytes containing "ok"). - // - // Cargo has always included a `.cargo-ok` file to detect if - // extraction was interrupted, but it was originally empty. - // - // In 1.34, Cargo was changed to create the `.cargo-ok` file - // before it started extraction to implement fine-grained - // locking. After it was finished extracting, it wrote two - // bytes to indicate it was complete. It would use the length - // check to detect if it was possibly interrupted. - // - // In 1.36, Cargo changed to not use fine-grained locking, and - // instead used a global lock. The use of `.cargo-ok` was no - // longer needed for locking purposes, but was kept to detect - // when extraction was interrupted. - // - // In 1.49, Cargo changed to not create the `.cargo-ok` file - // before it started extraction to deal with `.crate` files - // that inexplicably had a `.cargo-ok` file in them. - // - // In 1.64, Cargo changed to detect `.crate` files with - // `.cargo-ok` files in them in response to CVE-2022-36113, - // which dealt with malicious `.crate` files making - // `.cargo-ok` a symlink causing cargo to write "ok" to any - // arbitrary file on the filesystem it has permission to. - // - // This is all a long-winded way of explaining the - // circumstances that might cause a directory to contain a - // `.cargo-ok` file that is empty or otherwise corrupted. - // Either this was extracted by a version of Rust before 1.34, - // in which case everything should be fine. 
However, an empty - // file created by versions 1.36 to 1.49 indicates that the - // extraction was interrupted and that we need to start again. - // - // Another possibility is that the filesystem is simply - // corrupted, in which case deleting the directory might be - // the safe thing to do. That is probably unlikely, though. - // - // To be safe, this deletes the directory and starts over - // again. + // See comment of `unpack_package` about why removing all stuff. log::warn!("unexpected length of {path:?}, clearing cache"); paths::remove_dir_all(dst.as_path_unlocked())?; } @@ -758,6 +839,12 @@ impl<'cfg> RegistrySource<'cfg> { Ok(unpack_dir.to_path_buf()) } + /// Turns the downloaded `.crate` tarball file into a [`Package`]. + /// + /// This unconditionally sets checksum for the returned package, so it + /// should only be called after doing integrity check. That is to say, + /// you need to call either [`RegistryData::download`] or + /// [`RegistryData::finish_download`] before calling this method. fn get_pkg(&mut self, package: PackageId, path: &File) -> CargoResult { let path = self .unpack_package(package, path) From 788816718290b6e00cf4347afabc26f9719762b3 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Wed, 7 Jun 2023 13:47:19 +0100 Subject: [PATCH 3/3] doc: doc comments for registry index --- src/cargo/sources/registry/index.rs | 282 ++++++++++++++++++---------- 1 file changed, 185 insertions(+), 97 deletions(-) diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs index d857a053ed2..1c5587a4a8c 100644 --- a/src/cargo/sources/registry/index.rs +++ b/src/cargo/sources/registry/index.rs @@ -1,11 +1,26 @@ -//! Management of the index of a registry source +//! Management of the index of a registry source. //! //! This module contains management of the index and various operations, such as //! actually parsing the index, looking for crates, etc. This is intended to be -//! 
abstract over remote indices (downloaded via git) and local registry indices -//! (which are all just present on the filesystem). +//! abstract over remote indices (downloaded via Git or HTTP) and local registry +//! indices (which are all just present on the filesystem). //! -//! ## Index Performance +//! ## How the index works +//! +//! Here is a simple flow when loading a [`Summary`] (metadata) from the index: +//! +//! 1. A query is fired via [`RegistryIndex::query_inner`]. +//! 2. Tries loading all summaries via [`RegistryIndex::load_summaries`], and +//! under the hood calling [`Summaries::parse`] to parse an index file. +//! 1. If an on-disk index cache is present, loads it via +//! [`Summaries::parse_cache`]. +//! 2. Otherwise goes to the slower path [`RegistryData::load`] to get the +//! specific index file. +//! 3. A [`Summary`] is now ready in callback `f` in [`RegistryIndex::query_inner`]. +//! +//! This is just an overview. To know the rationale behind, continue reading. +//! +//! ## A layer of on-disk index cache for performance //! //! One important aspect of the index is that we want to optimize the "happy //! path" as much as possible. Whenever you type `cargo build` Cargo will @@ -20,19 +35,20 @@ //! don't need them. Most secondary optimizations are centered around removing //! allocations and such, but avoiding parsing JSON is the #1 optimization. //! -//! When we get queries from the resolver we're given a `Dependency`. This +//! When we get queries from the resolver we're given a [`Dependency`]. This //! dependency in turn has a version requirement, and with lock files that //! already exist these version requirements are exact version requirements //! `=a.b.c`. This means that we in theory only need to parse one line of JSON //! per query in the registry, the one that matches version `a.b.c`. //! //! The crates.io index, however, is not amenable to this form of query. Instead -//! 
the crates.io index simply is a file where each line is a JSON blob. To -//! learn about the versions in each JSON blob we would need to parse the JSON, -//! defeating the purpose of trying to parse as little as possible. +//! the crates.io index simply is a file where each line is a JSON blob, aka +//! [`RegistryPackage`]. To learn about the versions in each JSON blob we +//! would need to parse the JSON via [`IndexSummary::parse`], defeating the +//! purpose of trying to parse as little as possible. //! //! > Note that as a small aside even *loading* the JSON from the registry is -//! > actually pretty slow. For crates.io and remote registries we don't +//! > actually pretty slow. For crates.io and [`RemoteRegistry`] we don't //! > actually check out the git index on disk because that takes quite some //! > time and is quite large. Instead we use `libgit2` to read the JSON from //! > the raw git objects. This in turn can be slow (aka show up high in @@ -43,14 +59,14 @@ //! (first time being for an entire computer) Cargo will load the contents //! (slowly via libgit2) from the registry. It will then (slowly) parse every //! single line to learn about its versions. Afterwards, however, Cargo will -//! emit a new file (a cache) which is amenable for speedily parsing in future -//! invocations. +//! emit a new file (a cache, represented as [`SummariesCache`]) which is +//! amenable for speedily parsing in future invocations. //! //! This cache file is currently organized by basically having the semver -//! version extracted from each JSON blob. That way Cargo can quickly and easily -//! parse all versions contained and which JSON blob they're associated with. -//! The JSON blob then doesn't actually need to get parsed unless the version is -//! parsed. +//! version extracted from each JSON blob. That way Cargo can quickly and +//! easily parse all versions contained and which JSON blob they're associated +//! with. 
The JSON blob then doesn't actually need to get parsed unless the +//! version is parsed. //! //! Altogether the initial measurements of this shows a massive improvement for //! Cargo null build performance. It's expected that the improvements earned @@ -65,6 +81,9 @@ //! Note that this is just a high-level overview, there's of course lots of //! details like invalidating caches and whatnot which are handled below, but //! hopefully those are more obvious inline in the code itself. +//! +//! [`RemoteRegistry`]: super::remote::RemoteRegistry +//! [`Dependency`]: crate::core::Dependency use crate::core::{PackageId, SourceId, Summary}; use crate::sources::registry::{LoadResponse, RegistryData, RegistryPackage, INDEX_V_MAX}; @@ -83,17 +102,24 @@ use std::task::{ready, Poll}; /// Manager for handling the on-disk index. /// -/// Note that local and remote registries store the index differently. Local -/// is a simple on-disk tree of files of the raw index. Remote registries are -/// stored as a raw git repository. The different means of access are handled -/// via the [`RegistryData`] trait abstraction. +/// Different kinds of registries store the index differently: +/// +/// * [`LocalRegistry`] is a simple on-disk tree of files of the raw index. +/// * [`RemoteRegistry`] is stored as a raw git repository. +/// * [`HttpRegistry`] fills the on-disk index cache directly without keeping +/// any raw index. /// +/// These means of access are handled via the [`RegistryData`] trait abstraction. /// This transparently handles caching of the index in a more efficient format. +/// +/// [`LocalRegistry`]: super::local::LocalRegistry +/// [`RemoteRegistry`]: super::remote::RemoteRegistry +/// [`HttpRegistry`]: super::http_remote::HttpRegistry pub struct RegistryIndex<'cfg> { source_id: SourceId, /// Root directory of the index for the registry. path: Filesystem, - /// Cache of summary data. + /// In-memory cache of summary data. /// /// This is keyed off the package name. 
The [`Summaries`] value handles /// loading the summary data. It keeps an optimized on-disk representation @@ -110,14 +136,16 @@ pub struct RegistryIndex<'cfg> { /// /// A list of summaries are loaded from disk via one of two methods: /// -/// 1. Primarily Cargo will parse the corresponding file for a crate in the -/// upstream crates.io registry. That's just a JSON blob per line which we -/// can parse, extract the version, and then store here. +/// 1. From raw registry index --- Primarily Cargo will parse the corresponding +/// file for a crate in the upstream crates.io registry. That's just a JSON +/// blob per line which we can parse, extract the version, and then store here. +/// See [`RegistryPackage`] and [`IndexSummary::parse`]. /// -/// 2. Alternatively, if Cargo has previously run, we'll have a cached index of -/// dependencies for the upstream index. This is a file that Cargo maintains -/// lazily on the local filesystem and is much faster to parse since it -/// doesn't involve parsing all of the JSON. +/// 2. From on-disk index cache --- If Cargo has previously run, we'll have a +/// cached index of dependencies for the upstream index. This is a file that +/// Cargo maintains lazily on the local filesystem and is much faster to +/// parse since it doesn't involve parsing all of the JSON. +/// See [`SummariesCache`]. /// /// The outward-facing interface of this doesn't matter too much where it's /// loaded from, but it's important when reading the implementation to note that @@ -134,37 +162,100 @@ struct Summaries { versions: HashMap, } -/// A lazily parsed `IndexSummary`. +/// A lazily parsed [`IndexSummary`]. enum MaybeIndexSummary { /// A summary which has not been parsed, The `start` and `end` are pointers - /// into `Summaries::raw_data` which this is an entry of. + /// into [`Summaries::raw_data`] which this is an entry of. Unparsed { start: usize, end: usize }, /// An actually parsed summary. 
Parsed(IndexSummary), } -/// A parsed representation of a summary from the index. +/// A parsed representation of a summary from the index. This is usually parsed +/// from a line from a raw index file, or a JSON blob from on-disk index cache. /// -/// In addition to a full `Summary` we have information on whether it is `yanked`. +/// In addition to a full [`Summary`], we have information on whether it is `yanked`. pub struct IndexSummary { pub summary: Summary, pub yanked: bool, - /// Schema version, see [`RegistryPackage`]. + /// Schema version, see [`RegistryPackage::v`]. v: u32, } /// A representation of the cache on disk that Cargo maintains of summaries. +/// /// Cargo will initially parse all summaries in the registry and will then /// serialize that into this form and place it in a new location on disk, /// ensuring that access in the future is much speedier. +/// +/// For serialization and deserialization of this on-disk index cache of +/// summaries, see [`SummariesCache::serialize`] and [`SummariesCache::parse`]. +/// +/// # The format of the index cache +/// +/// The idea of this format is that it's a very easy file for Cargo to parse in +/// future invocations. The read from disk should be fast and then afterwards +/// all we need to know is what versions correspond to which JSON blob. +/// +/// Currently the format looks like: +/// +/// ```text +/// +---------------+----------------------+--------------------+---+ +/// | cache version | index format version | index file version | 0 | +/// +---------------+----------------------+--------------------+---+ +/// ``` +/// +/// followed by one or more (version + JSON blob) pairs... +/// +/// ```text +/// +----------------+---+-----------+---+ +/// | semver version | 0 | JSON blob | 0 | ... 
+/// +----------------+---+-----------+---+ +/// ``` +/// +/// Each field represents: +/// +/// * _cache version_ --- Intended to ensure that there's some level of +/// future compatibility against changes to this cache format so if different +/// versions of Cargo share the same cache they don't get too confused. +/// * _index format version_ --- The version of the raw index file. +/// See [`RegistryPackage::v`] for the detail. +/// * _index file version_ --- Tracks when a cache needs to be regenerated. +/// A cache regeneration is required whenever the index file itself updates. +/// * _semver version_ --- The version for each JSON blob. Extracted from the +/// blob for fast queries without parsing the entire blob. +/// * _JSON blob_ --- The actual metadata for each version of the package. It +/// has the same representation as [`RegistryPackage`]. +/// +/// # Changes between each cache version +/// +/// * `1`: The original version. +/// * `2`: Added the "index format version" field so that if the index format +/// changes, different versions of cargo won't get confused reading each +/// other's caches. +/// * `3`: Bumped the version to work around an issue where multiple versions of +/// a package were published that differ only by semver metadata. For +/// example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache +/// would be incorrectly populated with two entries, both 110.0.0. After +/// this, the metadata will be correctly included. This isn't really a format +/// change, just a version bump to clear the incorrect cache entries. Note: +/// the index shouldn't allow these, but unfortunately crates.io doesn't +/// check it. +/// +/// See [`CURRENT_CACHE_VERSION`] for the current cache version. #[derive(Default)] struct SummariesCache<'a> { + /// JSON blobs of the summaries. Each JSON blob has a [`Version`] beside, + /// so that Cargo can query a version without full JSON parsing. 
versions: Vec<(Version, &'a [u8])>, + /// For cache invalidation, we track the index file version to determine + /// when to regenerate the cache itself. index_version: &'a str, } impl<'cfg> RegistryIndex<'cfg> { + /// Creates an empty registry index at `path`. pub fn new( source_id: SourceId, path: &Filesystem, @@ -178,7 +269,9 @@ impl<'cfg> RegistryIndex<'cfg> { } } - /// Returns the hash listed for a specified `PackageId`. + /// Returns the hash listed for a specified `PackageId`. Primarily for + /// checking the integrity of a downloaded package matching the checksum in + /// the index file, aka [`IndexSummary`]. pub fn hash(&mut self, pkg: PackageId, load: &mut dyn RegistryData) -> Poll> { let req = OptVersionReq::exact(pkg.version()); let summary = self.summaries(&pkg.name(), &req, load)?; @@ -191,10 +284,14 @@ impl<'cfg> RegistryIndex<'cfg> { } /// Load a list of summaries for `name` package in this registry which - /// match `req` + /// match `req`. + /// + /// This function will semantically + /// + /// 1. parse the index file (either raw or cache), + /// 2. match all versions, + /// 3. and then return an iterator over all summaries which matched. /// - /// This function will semantically parse the on-disk index, match all - /// versions, and then return an iterator over all summaries which matched. /// Internally there's quite a few layer of caching to amortize this cost /// though since this method is called quite a lot on null builds in Cargo. pub fn summaries<'a, 'b>( @@ -209,10 +306,7 @@ impl<'cfg> RegistryIndex<'cfg> { let source_id = self.source_id; let config = self.config; - // First up actually parse what summaries we have available. If Cargo - // has run previously this will parse a Cargo-specific cache file rather - // than the registry itself. In effect this is intended to be a quite - // cheap operation. + // First up parse what summaries we have available.
let name = InternedString::new(name); let summaries = ready!(self.load_summaries(name, load)?); @@ -251,13 +345,28 @@ impl<'cfg> RegistryIndex<'cfg> { }))) } + /// Actually parses what summaries we have available. + /// + /// If Cargo has run previously, this tries in this order: + /// + /// 1. Returns from in-memory cache, aka [`RegistryIndex::summaries_cache`]. + /// 2. If missing, hands over to [`Summaries::parse`] to parse an index file. + /// + /// The actual kind of index file being parsed depends on which kind of + /// [`RegistryData`] the `load` argument is given. For example, a + /// Git-based [`RemoteRegistry`] will first try an on-disk index cache + /// file, and then try parsing the raw registry index from the Git repository. + /// + /// In effect, this is intended to be a quite cheap operation. + /// + /// [`RemoteRegistry`]: super::remote::RemoteRegistry fn load_summaries( &mut self, name: InternedString, load: &mut dyn RegistryData, ) -> Poll> { // If we've previously loaded what versions are present for `name`, just - // return that since our cache should still be valid. + // return that since our in-memory cache should still be valid. if self.summaries_cache.contains_key(&name) { return Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap())); } @@ -295,6 +404,9 @@ impl<'cfg> RegistryIndex<'cfg> { self.summaries_cache.clear(); } + /// Attempts to find the packages that match a `name` and a version `req`. + /// + /// This is primarily used by [`Source::query`](super::Source). pub fn query_inner( &mut self, name: &str, @@ -324,6 +436,10 @@ impl<'cfg> RegistryIndex<'cfg> { .map_ok(|_| ()) } + /// Inner implementation of [`Self::query_inner`]. Returns the number of + /// summaries we've got. + /// + /// The `online` controls whether Cargo can access the network when needed.
fn query_inner_with_online( &mut self, name: &str, @@ -404,6 +520,7 @@ impl<'cfg> RegistryIndex<'cfg> { Poll::Ready(Ok(count)) } + /// Looks into the summaries to check if a package has been yanked. pub fn is_yanked( &mut self, pkg: PackageId, @@ -418,23 +535,26 @@ impl<'cfg> RegistryIndex<'cfg> { } impl Summaries { - /// Parse out a `Summaries` instances from on-disk state. + /// Parse out a [`Summaries`] instance from on-disk state. + /// + /// This will do the following in order: /// - /// This will attempt to prefer parsing a previous cache file that already - /// exists from a previous invocation of Cargo (aka you're typing `cargo - /// build` again after typing it previously). If parsing fails or the cache - /// isn't found, then we take a slower path which loads the full descriptor - /// for `relative` from the underlying index (aka typically libgit2 with - /// crates.io) and then parse everything in there. + /// 1. Attempt to prefer parsing a previous index cache file that already + /// exists from a previous invocation of Cargo (aka you're typing `cargo + /// build` again after typing it previously). + /// 2. If parsing fails, or the cache isn't found or is invalid, we then + /// take a slower path which loads the full descriptor for `relative` + /// from the underlying index (aka libgit2 with crates.io, or from a + /// remote HTTP index) and then parse everything in there. /// - /// * `root` - this is the root argument passed to `load` - /// * `cache_root` - this is the root on the filesystem itself of where to - /// store cache files. - /// * `relative` - this is the file we're loading from cache or the index + /// * `root` --- this is the root argument passed to `load` + /// * `cache_root` --- this is the root on the filesystem itself of where + /// to store cache files.
+ /// * `relative` --- this is the file we're loading from cache or the index /// data - /// * `source_id` - the registry's SourceId used when parsing JSON blobs to - /// create summaries. - /// * `load` - the actual index implementation which may be very slow to + /// * `source_id` --- the registry's SourceId used when parsing JSON blobs + /// to create summaries. + /// * `load` --- the actual index implementation which may be very slow to /// call. We avoid this if we can. pub fn parse( root: &Path, @@ -549,8 +669,8 @@ impl Summaries { } } - /// Parses an open `File` which represents information previously cached by - /// Cargo. + /// Parses the contents of an on-disk cache, aka [`SummariesCache`], which + /// represents information previously cached by Cargo. pub fn parse_cache(contents: Vec) -> CargoResult<(Summaries, InternedString)> { let cache = SummariesCache::parse(&contents)?; let index_version = InternedString::new(cache.index_version); @@ -577,46 +697,11 @@ impl Summaries { } } -// Implementation of serializing/deserializing the cache of summaries on disk. -// Currently the format looks like: -// -// +--------------------+----------------------+-------------+---+ -// | cache version byte | index format version | git sha rev | 0 | -// +--------------------+----------------------+-------------+---+ -// -// followed by... -// -// +----------------+---+------------+---+ -// | semver version | 0 | JSON blob | 0 | ... -// +----------------+---+------------+---+ -// -// The idea is that this is a very easy file for Cargo to parse in future -// invocations. The read from disk should be quite fast and then afterwards all -// we need to know is what versions correspond to which JSON blob. -// -// The leading version byte is intended to ensure that there's some level of -// future compatibility against changes to this cache format so if different -// versions of Cargo share the same cache they don't get too confused. 
The git -// sha lets us know when the file needs to be regenerated (it needs regeneration -// whenever the index itself updates). -// -// Cache versions: -// * `1`: The original version. -// * `2`: Added the "index format version" field so that if the index format -// changes, different versions of cargo won't get confused reading each -// other's caches. -// * `3`: Bumped the version to work around an issue where multiple versions of -// a package were published that differ only by semver metadata. For -// example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache -// would be incorrectly populated with two entries, both 110.0.0. After -// this, the metadata will be correctly included. This isn't really a format -// change, just a version bump to clear the incorrect cache entries. Note: -// the index shouldn't allow these, but unfortunately crates.io doesn't -// check it. - +/// The current version of [`SummariesCache`]. const CURRENT_CACHE_VERSION: u8 = 3; impl<'a> SummariesCache<'a> { + /// Deserializes an on-disk cache. fn parse(data: &'a [u8]) -> CargoResult> { // NB: keep this method in sync with `serialize` below let (first_byte, rest) = data @@ -655,6 +740,7 @@ impl<'a> SummariesCache<'a> { Ok(ret) } + /// Serializes itself with a given `index_version`. fn serialize(&self, index_version: &str) -> Vec { // NB: keep this method in sync with `parse` above let size = self @@ -709,10 +795,11 @@ impl From for MaybeIndexSummary { } impl IndexSummary { - /// Parses a line from the registry's index file into an `IndexSummary` for - /// a package. + /// Parses a line from the registry's index file into an [`IndexSummary`] + /// for a package. /// - /// The `line` provided is expected to be valid JSON. + /// The `line` provided is expected to be valid JSON. It is supposed to be + /// a [`RegistryPackage`]. 
fn parse(config: &Config, line: &[u8], source_id: SourceId) -> CargoResult { // ****CAUTION**** Please be extremely careful with returning errors // from this function. Entries that error are not included in the @@ -754,6 +841,7 @@ impl IndexSummary { } } +/// Like [`slice::split`] but is optimized by [`memchr`]. fn split(haystack: &[u8], needle: u8) -> impl Iterator { struct Split<'a> { haystack: &'a [u8],