From 354d70d8a4acfc830b4931a90902d46abc193592 Mon Sep 17 00:00:00 2001
From: Weihang Lo <me@weihanglo.tw>
Date: Tue, 30 May 2023 12:04:21 +0100
Subject: [PATCH 1/3] docs: add link to nightly config doc for
 `SourceConfigMap`

---
 src/cargo/sources/config.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/cargo/sources/config.rs b/src/cargo/sources/config.rs
index 5d5a4e8dbdf..4097567bbf6 100644
--- a/src/cargo/sources/config.rs
+++ b/src/cargo/sources/config.rs
@@ -14,7 +14,9 @@ use log::debug;
 use std::collections::{HashMap, HashSet};
 use url::Url;
 
-/// Represents the entire `[source]` table in Cargo configuration.
+/// Represents the entire [`[source]` replacement table][1] in Cargo configuration.
+///
+/// [1]: https://doc.rust-lang.org/nightly/cargo/reference/config.html#source
 #[derive(Clone)]
 pub struct SourceConfigMap<'cfg> {
     /// Mapping of source name to the toml configuration.

From 0b5ea836f405715bb31d38bdc95dd4f028991c4f Mon Sep 17 00:00:00 2001
From: Weihang Lo <me@weihanglo.tw>
Date: Wed, 7 Jun 2023 13:49:22 +0100
Subject: [PATCH 2/3] doc: doc comments for registry module

---
 src/cargo/sources/registry/mod.rs | 299 +++++++++++++++++++-----------
 1 file changed, 193 insertions(+), 106 deletions(-)

diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs
index 4143ac16371..373d9cd555e 100644
--- a/src/cargo/sources/registry/mod.rs
+++ b/src/cargo/sources/registry/mod.rs
@@ -2,13 +2,47 @@
 //!
 //! # What's a Registry?
 //!
-//! Registries are central locations where packages can be uploaded to,
+//! [Registries] are central locations where packages can be uploaded to,
 //! discovered, and searched for. The purpose of a registry is to have a
 //! location that serves as permanent storage for versions of a crate over time.
 //!
-//! Compared to git sources, a registry provides many packages as well as many
-//! versions simultaneously. Git sources can also have commits deleted through
-//! rebasings where registries cannot have their versions deleted.
+//! Compared to git sources (see [`GitSource`]), a registry provides many
+//! packages as well as many versions simultaneously. Git sources can also
+//! have commits deleted through rebasings where registries cannot have their
+//! versions deleted.
+//!
+//! In Cargo, [`RegistryData`] is an abstraction over each kind of actual
+//! registry, and [`RegistrySource`] connects those implementations to the
+//! [`Source`] trait. Two prominent features these abstractions provide are
+//!
+//! * A way to query the metadata of a package from a registry. The metadata
+//!   comes from the index.
+//! * A way to download package contents (a.k.a source files) that are required
+//!   when building the package itself.
+//!
+//! We'll cover each functionality later.
+//!
+//! [Registries]: https://doc.rust-lang.org/nightly/cargo/reference/registries.html
+//! [`GitSource`]: super::GitSource
+//!
+//! # Different Kinds of Registries
+//!
+//! Cargo provides multiple kinds of registries. Each of them serves the index
+//! and package contents in a slightly different way. Namely,
+//!
+//! * [`LocalRegistry`] --- Serves the index and package contents entirely on
+//!   a local filesystem.
+//! * [`RemoteRegistry`] --- Serves the index ahead of time from a Git
+//!   repository, and package contents are downloaded as needed.
+//! * [`HttpRegistry`] --- Serves both the index and package contents on demand
+//!   over an HTTP-based registry API. This is the default starting from 1.70.0.
+//!
+//! Each registry has its own [`RegistryData`] implementation, and can be
+//! created from either [`RegistrySource::local`] or [`RegistrySource::remote`].
+//!
+//! [`LocalRegistry`]: local::LocalRegistry
+//! [`RemoteRegistry`]: remote::RemoteRegistry
+//! [`HttpRegistry`]: http_remote::HttpRegistry
 //!
 //! # The Index of a Registry
 //!
@@ -20,36 +54,16 @@
 //! available on a registry, what versions are available, and what the
 //! dependencies for each version is.
 //!
-//! One method of doing so would be having the registry expose an HTTP endpoint
-//! which can be queried with a list of packages and a response of their
-//! dependencies and versions is returned. This is somewhat inefficient however
-//! as we may have to hit the endpoint many times and we may have already
-//! queried for much of the data locally already (for other packages, for
-//! example). This also involves inventing a transport format between the
-//! registry and Cargo itself, so this route was not taken.
-//!
-//! Instead, Cargo communicates with registries through a git repository
-//! referred to as the Index. The Index of a registry is essentially an easily
-//! query-able version of the registry's database for a list of versions of a
-//! package as well as a list of dependencies for each version.
+//! To solve the problem, a registry must provide an index of package metadata.
+//! The index of a registry is essentially an easily query-able version of the
+//! registry's database for a list of versions of a package as well as a list
+//! of dependencies for each version. The exact format of the index is
+//! described later.
 //!
-//! Using git to host this index provides a number of benefits:
+//! See the [`index`] module for topics about the management, parsing, caching,
+//! and versioning for the on-disk index.
 //!
-//! * The entire index can be stored efficiently locally on disk. This means
-//!   that all queries of a registry can happen locally and don't need to touch
-//!   the network.
-//!
-//! * Updates of the index are quite efficient. Using git buys incremental
-//!   updates, compressed transmission, etc for free. The index must be updated
-//!   each time we need fresh information from a registry, but this is one
-//!   update of a git repository that probably hasn't changed a whole lot so
-//!   it shouldn't be too expensive.
-//!
-//!   Additionally, each modification to the index is just appending a line at
-//!   the end of a file (the exact format is described later). This means that
-//!   the commits for an index are quite small and easily applied/compressible.
-//!
-//! ## The format of the Index
+//! ## The Format of The Index
 //!
 //! The index is a store for the list of versions for all packages known, so its
 //! format on disk is optimized slightly to ensure that `ls registry` doesn't
@@ -59,9 +73,12 @@
 //! about the format of the registry:
 //!
 //! 1. Each crate will have one file corresponding to it. Each version for a
-//!    crate will just be a line in this file.
+//!    crate will just be a line in this file (see [`RegistryPackage`] for its
+//!    representation).
 //! 2. There will be two tiers of directories for crate names, under which
 //!    crates corresponding to those tiers will be located.
+//!    (See [`cargo_util::registry::make_dep_path`] for the implementation of
+//!    this layout hierarchy.)
 //!
 //! As an example, this is an example hierarchy of an index:
 //!
@@ -99,26 +116,30 @@
 //! The purpose of this layout is to hopefully cut down on `ls` sizes as well as
 //! efficient lookup based on the crate name itself.
 //!
-//! ## Crate files
+//! See [The Cargo Book: Registry Index][registry-index] for the public
+//! interface on the index format.
+//!
+//! [registry-index]: https://doc.rust-lang.org/nightly/cargo/reference/registry-index.html
+//!
+//! ## The Index Files
 //!
 //! Each file in the index is the history of one crate over time. Each line in
 //! the file corresponds to one version of a crate, stored in JSON format (see
-//! the `RegistryPackage` structure below).
+//! the [`RegistryPackage`] structure below).
 //!
-//! As new versions are published, new lines are appended to this file. The only
-//! modifications to this file that should happen over time are yanks of a
-//! particular version.
+//! As new versions are published, new lines are appended to this file. **The
+//! only modifications to this file that should happen over time are yanks of a
+//! particular version.**
 //!
 //! # Downloading Packages
 //!
-//! The purpose of the Index was to provide an efficient method to resolve the
-//! dependency graph for a package. So far we only required one network
-//! interaction to update the registry's repository (yay!). After resolution has
-//! been performed, however we need to download the contents of packages so we
-//! can read the full manifest and build the source code.
+//! The purpose of the index was to provide an efficient method to resolve the
+//! dependency graph for a package. After resolution has been performed, we need
+//! to download the contents of packages so we can read the full manifest and
+//! build the source code.
 //!
-//! To accomplish this, this source's `download` method will make an HTTP
-//! request per-package requested to download tarballs into a local cache. These
+//! To accomplish this, [`RegistryData::download`] will "make" an HTTP request
+//! per-package requested to download tarballs into a local cache. These
 //! tarballs will then be unpacked into a destination folder.
 //!
 //! Note that because versions uploaded to the registry are frozen forever that
@@ -128,7 +149,8 @@
 //!
 //! # Filesystem Hierarchy
 //!
-//! Overall, the `$HOME/.cargo` looks like this when talking about the registry:
+//! Overall, the `$HOME/.cargo` looks like this when talking about the registry
+//! (remote registries, specifically):
 //!
 //! ```notrust
 //! # A folder under which all registry metadata is hosted (similar to
@@ -144,8 +166,8 @@
 //!         registry2-<hash>/
 //!         ...
 //!
-//!     # This folder is a cache for all downloaded tarballs from a registry.
-//!     # Once downloaded and verified, a tarball never changes.
+//!     # This folder is a cache for all downloaded tarballs (`.crate` file)
+//!     # from a registry. Once downloaded and verified, a tarball never changes.
 //!     cache/
 //!         registry1-<hash>/<pkg>-<version>.crate
 //!         ...
@@ -153,6 +175,7 @@
 //!     # Location in which all tarballs are unpacked. Each tarball is known to
 //!     # be frozen after downloading, so transitively this folder is also
 //!     # frozen once its unpacked (it's never unpacked again)
+//!     # CAVEAT: They are not read-only. See rust-lang/cargo#9455.
 //!     src/
 //!         registry1-<hash>/<pkg>-<version>/...
 //!         ...
@@ -186,26 +209,35 @@ use crate::util::{
     restricted_names, CargoResult, Config, Filesystem, LimitErrorReader, OptVersionReq,
 };
 
+/// The `.cargo-ok` file is used to track if the source is already unpacked.
+/// See [`RegistrySource::unpack_package`] for more.
+///
+/// Not to be confused with `.cargo-ok` file in git sources.
 const PACKAGE_SOURCE_LOCK: &str = ".cargo-ok";
+
 pub const CRATES_IO_INDEX: &str = "https://github.com/rust-lang/crates.io-index";
 pub const CRATES_IO_HTTP_INDEX: &str = "sparse+https://index.crates.io/";
 pub const CRATES_IO_REGISTRY: &str = "crates-io";
 pub const CRATES_IO_DOMAIN: &str = "crates.io";
+
 const CRATE_TEMPLATE: &str = "{crate}";
 const VERSION_TEMPLATE: &str = "{version}";
 const PREFIX_TEMPLATE: &str = "{prefix}";
 const LOWER_PREFIX_TEMPLATE: &str = "{lowerprefix}";
 const CHECKSUM_TEMPLATE: &str = "{sha256-checksum}";
+
 const MAX_UNPACK_SIZE: u64 = 512 * 1024 * 1024;
 const MAX_COMPRESSION_RATIO: usize = 20; // 20:1
 
-/// A "source" for a local (see `local::LocalRegistry`) or remote (see
-/// `remote::RemoteRegistry`) registry.
+/// A [`Source`] implementation for a local or a remote registry.
 ///
-/// This contains common functionality that is shared between the two registry
-/// kinds, with the registry-specific logic implemented as part of the
+/// This contains common functionality that is shared between each registry
+/// kind, with the registry-specific logic implemented as part of the
 /// [`RegistryData`] trait referenced via the `ops` field.
+///
+/// For general concepts of registries, see the [module-level documentation](crate::sources::registry).
 pub struct RegistrySource<'cfg> {
+    /// The unique identifier of this source.
     source_id: SourceId,
     /// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`).
     src_path: Filesystem,
@@ -225,7 +257,19 @@ pub struct RegistrySource<'cfg> {
     yanked_whitelist: HashSet<PackageId>,
 }
 
-/// The `config.json` file stored in the index.
+/// The [`config.json`] file stored in the index.
+///
+/// The config file may look like:
+///
+/// ```json
+/// {
+///     "dl": "https://example.com/api/{crate}/{version}/download",
+///     "api": "https://example.com/api",
+///     "auth-required": false             # unstable feature (RFC 3139)
+/// }
+/// ```
+///
+/// [`config.json`]: https://doc.rust-lang.org/nightly/cargo/reference/registry-index.html#index-configuration
 #[derive(Deserialize, Debug, Clone)]
 #[serde(rename_all = "kebab-case")]
 pub struct RegistryConfig {
@@ -252,22 +296,29 @@ pub struct RegistryConfig {
     /// If this is None, the registry does not support API commands.
     pub api: Option<String>,
 
-    /// Whether all operations require authentication.
+    /// Whether all operations require authentication. See [RFC 3139].
+    ///
+    /// [RFC 3139]: https://rust-lang.github.io/rfcs/3139-cargo-alternative-registry-auth.html
     #[serde(default)]
     pub auth_required: bool,
 }
 
-/// The maximum version of the `v` field in the index this version of cargo
-/// understands.
+/// The maximum schema version of the `v` field in the index this version of
+/// cargo understands. See [`RegistryPackage::v`] for details.
 pub(crate) const INDEX_V_MAX: u32 = 2;
 
 /// A single line in the index representing a single version of a package.
 #[derive(Deserialize)]
 pub struct RegistryPackage<'a> {
+    /// Name of the package.
     name: InternedString,
+    /// The version of this package.
     vers: Version,
+    /// All kinds of direct dependencies of the package, including dev and
+    /// build dependencies.
     #[serde(borrow)]
     deps: Vec<RegistryDependency<'a>>,
+    /// Set of features defined for the package, i.e., `[features]` table.
     features: BTreeMap<InternedString, Vec<InternedString>>,
     /// This field contains features with new, extended syntax. Specifically,
     /// namespaced features (`dep:`) and weak dependencies (`pkg?/feat`).
@@ -276,6 +327,7 @@ pub struct RegistryPackage<'a> {
     /// will fail to load due to not being able to parse the new syntax, even
     /// with a `Cargo.lock` file.
     features2: Option<BTreeMap<InternedString, Vec<InternedString>>>,
+    /// Checksum for verifying the integrity of the corresponding downloaded package.
     cksum: String,
     /// If `true`, Cargo will skip this version when resolving.
     ///
@@ -349,19 +401,33 @@ fn escaped_char_in_json() {
     .unwrap();
 }
 
-/// A dependency as encoded in the index JSON.
+/// A dependency as encoded in the [`RegistryPackage`] index JSON.
 #[derive(Deserialize)]
 struct RegistryDependency<'a> {
+    /// Name of the dependency. If the dependency is renamed, the original
+    /// would be stored in [`RegistryDependency::package`].
     name: InternedString,
+    /// The SemVer requirement for this dependency.
     #[serde(borrow)]
     req: Cow<'a, str>,
+    /// Set of features enabled for this dependency.
     features: Vec<InternedString>,
+    /// Whether or not this is an optional dependency.
     optional: bool,
+    /// Whether or not default features are enabled.
     default_features: bool,
+    /// The target platform for this dependency.
     target: Option<Cow<'a, str>>,
+    /// The dependency kind. "dev", "build", and "normal".
     kind: Option<Cow<'a, str>>,
+    /// The URL of the index of the registry where this dependency is from.
+    /// `None` if it is from the same index.
     registry: Option<Cow<'a, str>>,
+    /// The original name if the dependency is renamed.
     package: Option<InternedString>,
+    /// Whether or not this is a public dependency. Unstable. See [RFC 1977].
+    ///
+    /// [RFC 1977]: https://rust-lang.github.io/rfcs/1977-public-private-dependencies.html
     public: Option<bool>,
 }
 
@@ -437,6 +503,7 @@ pub enum LoadResponse {
     /// The cache is out of date. Returned data should be used.
     Data {
         raw_data: Vec<u8>,
+        /// Version of this data to determine whether it is out of date.
         index_version: Option<String>,
     },
 
@@ -444,10 +511,11 @@ pub enum LoadResponse {
     NotFound,
 }
 
-/// An abstract interface to handle both a local (see `local::LocalRegistry`)
-/// and remote (see `remote::RemoteRegistry`) registry.
+/// An abstract interface to handle both a local and a remote registry.
 ///
-/// This allows [`RegistrySource`] to abstractly handle both registry kinds.
+/// This allows [`RegistrySource`] to abstractly handle each registry kind.
+///
+/// For general concepts of registries, see the [module-level documentation](crate::sources::registry).
 pub trait RegistryData {
     /// Performs initialization for the registry.
     ///
@@ -458,14 +526,15 @@ pub trait RegistryData {
     /// Returns the path to the index.
     ///
     /// Note that different registries store the index in different formats
-    /// (remote=git, local=files).
+    /// (remote = git, http & local = files).
     fn index_path(&self) -> &Filesystem;
 
     /// Loads the JSON for a specific named package from the index.
     ///
     /// * `root` is the root path to the index.
     /// * `path` is the relative path to the package to load (like `ca/rg/cargo`).
-    /// * `index_version` is the version of the requested crate data currently in cache.
+    /// * `index_version` is the version of the requested crate data currently
+    ///    in cache. This is useful for checking if a local cache is outdated.
     fn load(
         &mut self,
         root: &Path,
@@ -556,6 +625,8 @@ mod index;
 mod local;
 mod remote;
 
+/// Generates a unique name for [`SourceId`] to have a unique path to put their
+/// index files.
 fn short_name(id: SourceId, is_shallow: bool) -> String {
     let hash = hex::short_hash(&id);
     let ident = id.url().host_str().unwrap_or("").to_string();
@@ -567,6 +638,11 @@ fn short_name(id: SourceId, is_shallow: bool) -> String {
 }
 
 impl<'cfg> RegistrySource<'cfg> {
+    /// Creates a [`Source`] of a "remote" registry.
+    /// It could be either an HTTP-based [`http_remote::HttpRegistry`] or
+    /// a Git-based [`remote::RemoteRegistry`].
+    ///
+    /// * `yanked_whitelist` --- Packages allowed to be used, even if they are yanked.
     pub fn remote(
         source_id: SourceId,
         yanked_whitelist: &HashSet<PackageId>,
@@ -596,6 +672,10 @@ impl<'cfg> RegistrySource<'cfg> {
         ))
     }
 
+    /// Creates a [`Source`] of a local registry, with [`local::LocalRegistry`] under the hood.
+    ///
+    /// * `path` --- The root path of a local registry on the file system.
+    /// * `yanked_whitelist` --- Packages allowed to be used, even if they are yanked.
     pub fn local(
         source_id: SourceId,
         path: &Path,
@@ -607,6 +687,10 @@ impl<'cfg> RegistrySource<'cfg> {
         RegistrySource::new(source_id, config, &name, Box::new(ops), yanked_whitelist)
     }
 
+    /// Creates a source of a registry. This is an inner helper function.
+    ///
+    /// * `name` --- Unique name for this source, used to store its source files (`.crate` tarballs).
+    /// * `ops` --- The underlying [`RegistryData`] type.
     fn new(
         source_id: SourceId,
         config: &'cfg Config,
@@ -624,7 +708,7 @@ impl<'cfg> RegistrySource<'cfg> {
         }
     }
 
-    /// Decode the configuration stored within the registry.
+    /// Decode the [configuration](RegistryConfig) stored within the registry.
     ///
     /// This requires that the index has been at least checked out.
     pub fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
@@ -635,9 +719,46 @@ impl<'cfg> RegistrySource<'cfg> {
     /// compiled.
     ///
     /// No action is taken if the source looks like it's already unpacked.
+    ///
+    /// # History of interruption detection with `.cargo-ok` file
+    ///
+    /// Cargo has always included a `.cargo-ok` file ([`PACKAGE_SOURCE_LOCK`])
+    /// to detect if extraction was interrupted, but it was originally empty.
+    ///
+    /// In 1.34, Cargo was changed to create the `.cargo-ok` file before it
+    /// started extraction to implement fine-grained locking. After it was
+    /// finished extracting, it wrote two bytes to indicate it was complete.
+    /// It would use the length check to detect if it was possibly interrupted.
+    ///
+    /// In 1.36, Cargo changed to not use fine-grained locking, and instead used
+    /// a global lock. The use of `.cargo-ok` was no longer needed for locking
+    /// purposes, but was kept to detect when extraction was interrupted.
+    ///
+    /// In 1.49, Cargo changed to not create the `.cargo-ok` file before it
+    /// started extraction to deal with `.crate` files that inexplicably had
+    /// a `.cargo-ok` file in them.
+    ///
+    /// In 1.64, Cargo changed to detect `.crate` files with `.cargo-ok` files
+    /// in them in response to [CVE-2022-36113], which dealt with malicious
+    /// `.crate` files making `.cargo-ok` a symlink causing cargo to write "ok"
+    /// to any arbitrary file on the filesystem it has permission to.
+    ///
+    /// This is all a long-winded way of explaining the circumstances that might
+    /// cause a directory to contain a `.cargo-ok` file that is empty or
+    /// otherwise corrupted. Either this was extracted by a version of Rust
+    /// before 1.34, in which case everything should be fine. However, an empty
+    /// file created by versions 1.36 to 1.49 indicates that the extraction was
+    /// interrupted and that we need to start again.
+    ///
+    /// Another possibility is that the filesystem is simply corrupted, in
+    /// which case deleting the directory might be the safe thing to do. That
+    /// is probably unlikely, though.
+    ///
+    /// To be safe, we delete the directory and start over again if an empty
+    /// `.cargo-ok` file is found.
+    ///
+    /// [CVE-2022-36113]: https://blog.rust-lang.org/2022/09/14/cargo-cves.html#arbitrary-file-corruption-cve-2022-36113
     fn unpack_package(&self, pkg: PackageId, tarball: &File) -> CargoResult<PathBuf> {
-        // The `.cargo-ok` file is used to track if the source is already
-        // unpacked.
         let package_dir = format!("{}-{}", pkg.name(), pkg.version());
         let dst = self.src_path.join(&package_dir);
         let path = dst.join(PACKAGE_SOURCE_LOCK);
@@ -646,47 +767,7 @@ impl<'cfg> RegistrySource<'cfg> {
         match path.metadata() {
             Ok(meta) if meta.len() > 0 => return Ok(unpack_dir.to_path_buf()),
             Ok(_meta) => {
-                // The `.cargo-ok` file is not in a state we expect it to be
-                // (with two bytes containing "ok").
-                //
-                // Cargo has always included a `.cargo-ok` file to detect if
-                // extraction was interrupted, but it was originally empty.
-                //
-                // In 1.34, Cargo was changed to create the `.cargo-ok` file
-                // before it started extraction to implement fine-grained
-                // locking. After it was finished extracting, it wrote two
-                // bytes to indicate it was complete. It would use the length
-                // check to detect if it was possibly interrupted.
-                //
-                // In 1.36, Cargo changed to not use fine-grained locking, and
-                // instead used a global lock. The use of `.cargo-ok` was no
-                // longer needed for locking purposes, but was kept to detect
-                // when extraction was interrupted.
-                //
-                // In 1.49, Cargo changed to not create the `.cargo-ok` file
-                // before it started extraction to deal with `.crate` files
-                // that inexplicably had a `.cargo-ok` file in them.
-                //
-                // In 1.64, Cargo changed to detect `.crate` files with
-                // `.cargo-ok` files in them in response to CVE-2022-36113,
-                // which dealt with malicious `.crate` files making
-                // `.cargo-ok` a symlink causing cargo to write "ok" to any
-                // arbitrary file on the filesystem it has permission to.
-                //
-                // This is all a long-winded way of explaining the
-                // circumstances that might cause a directory to contain a
-                // `.cargo-ok` file that is empty or otherwise corrupted.
-                // Either this was extracted by a version of Rust before 1.34,
-                // in which case everything should be fine. However, an empty
-                // file created by versions 1.36 to 1.49 indicates that the
-                // extraction was interrupted and that we need to start again.
-                //
-                // Another possibility is that the filesystem is simply
-                // corrupted, in which case deleting the directory might be
-                // the safe thing to do. That is probably unlikely, though.
-                //
-                // To be safe, this deletes the directory and starts over
-                // again.
+                // See the doc comment of `unpack_package` for why everything is removed.
                 log::warn!("unexpected length of {path:?}, clearing cache");
                 paths::remove_dir_all(dst.as_path_unlocked())?;
             }
@@ -758,6 +839,12 @@ impl<'cfg> RegistrySource<'cfg> {
         Ok(unpack_dir.to_path_buf())
     }
 
+    /// Turns the downloaded `.crate` tarball file into a [`Package`].
+    ///
+    /// This unconditionally sets checksum for the returned package, so it
+    /// should only be called after doing integrity check. That is to say,
+    /// you need to call either [`RegistryData::download`] or
+    /// [`RegistryData::finish_download`] before calling this method.
     fn get_pkg(&mut self, package: PackageId, path: &File) -> CargoResult<Package> {
         let path = self
             .unpack_package(package, path)

From 788816718290b6e00cf4347afabc26f9719762b3 Mon Sep 17 00:00:00 2001
From: Weihang Lo <me@weihanglo.tw>
Date: Wed, 7 Jun 2023 13:47:19 +0100
Subject: [PATCH 3/3] doc: doc comments for registry index

---
 src/cargo/sources/registry/index.rs | 282 ++++++++++++++++++----------
 1 file changed, 185 insertions(+), 97 deletions(-)

diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs
index d857a053ed2..1c5587a4a8c 100644
--- a/src/cargo/sources/registry/index.rs
+++ b/src/cargo/sources/registry/index.rs
@@ -1,11 +1,26 @@
-//! Management of the index of a registry source
+//! Management of the index of a registry source.
 //!
 //! This module contains management of the index and various operations, such as
 //! actually parsing the index, looking for crates, etc. This is intended to be
-//! abstract over remote indices (downloaded via git) and local registry indices
-//! (which are all just present on the filesystem).
+//! abstract over remote indices (downloaded via Git or HTTP) and local registry
+//! indices (which are all just present on the filesystem).
 //!
-//! ## Index Performance
+//! ## How the index works
+//!
+//! Here is a simple flow when loading a [`Summary`] (metadata) from the index:
+//!
+//! 1. A query is fired via [`RegistryIndex::query_inner`].
+//! 2. Tries loading all summaries via [`RegistryIndex::load_summaries`], and
+//!    under the hood calling [`Summaries::parse`] to parse an index file.
+//!     1. If an on-disk index cache is present, loads it via
+//!        [`Summaries::parse_cache`].
+//!     2. Otherwise goes to the slower path [`RegistryData::load`] to get the
+//!        specific index file.
+//! 3. A [`Summary`] is now ready in callback `f` in [`RegistryIndex::query_inner`].
+//!
+//! This is just an overview. To learn the rationale behind it, continue reading.
+//!
+//! ## A layer of on-disk index cache for performance
 //!
 //! One important aspect of the index is that we want to optimize the "happy
 //! path" as much as possible. Whenever you type `cargo build` Cargo will
@@ -20,19 +35,20 @@
 //! don't need them. Most secondary optimizations are centered around removing
 //! allocations and such, but avoiding parsing JSON is the #1 optimization.
 //!
-//! When we get queries from the resolver we're given a `Dependency`. This
+//! When we get queries from the resolver we're given a [`Dependency`]. This
 //! dependency in turn has a version requirement, and with lock files that
 //! already exist these version requirements are exact version requirements
 //! `=a.b.c`. This means that we in theory only need to parse one line of JSON
 //! per query in the registry, the one that matches version `a.b.c`.
 //!
 //! The crates.io index, however, is not amenable to this form of query. Instead
-//! the crates.io index simply is a file where each line is a JSON blob. To
-//! learn about the versions in each JSON blob we would need to parse the JSON,
-//! defeating the purpose of trying to parse as little as possible.
+//! the crates.io index simply is a file where each line is a JSON blob, aka
+//! [`RegistryPackage`]. To learn about the versions in each JSON blob we
+//! would need to parse the JSON via [`IndexSummary::parse`], defeating the
+//! purpose of trying to parse as little as possible.
 //!
 //! > Note that as a small aside even *loading* the JSON from the registry is
-//! > actually pretty slow. For crates.io and remote registries we don't
+//! > actually pretty slow. For crates.io and [`RemoteRegistry`] we don't
 //! > actually check out the git index on disk because that takes quite some
 //! > time and is quite large. Instead we use `libgit2` to read the JSON from
 //! > the raw git objects. This in turn can be slow (aka show up high in
@@ -43,14 +59,14 @@
 //! (first time being for an entire computer) Cargo will load the contents
 //! (slowly via libgit2) from the registry. It will then (slowly) parse every
 //! single line to learn about its versions. Afterwards, however, Cargo will
-//! emit a new file (a cache) which is amenable for speedily parsing in future
-//! invocations.
+//! emit a new file (a cache, represented as [`SummariesCache`]) which is
+//! amenable for speedily parsing in future invocations.
 //!
 //! This cache file is currently organized by basically having the semver
-//! version extracted from each JSON blob. That way Cargo can quickly and easily
-//! parse all versions contained and which JSON blob they're associated with.
-//! The JSON blob then doesn't actually need to get parsed unless the version is
-//! parsed.
+//! version extracted from each JSON blob. That way Cargo can quickly and
+//! easily parse all versions contained and which JSON blob they're associated
+//! with. The JSON blob then doesn't actually need to get parsed unless the
+//! version is parsed.
 //!
 //! Altogether the initial measurements of this shows a massive improvement for
 //! Cargo null build performance. It's expected that the improvements earned
@@ -65,6 +81,9 @@
 //! Note that this is just a high-level overview, there's of course lots of
 //! details like invalidating caches and whatnot which are handled below, but
 //! hopefully those are more obvious inline in the code itself.
+//!
+//! [`RemoteRegistry`]: super::remote::RemoteRegistry
+//! [`Dependency`]: crate::core::Dependency
 
 use crate::core::{PackageId, SourceId, Summary};
 use crate::sources::registry::{LoadResponse, RegistryData, RegistryPackage, INDEX_V_MAX};
@@ -83,17 +102,24 @@ use std::task::{ready, Poll};
 
 /// Manager for handling the on-disk index.
 ///
-/// Note that local and remote registries store the index differently. Local
-/// is a simple on-disk tree of files of the raw index. Remote registries are
-/// stored as a raw git repository. The different means of access are handled
-/// via the [`RegistryData`] trait abstraction.
+/// Different kinds of registries store the index differently:
+///
+/// * [`LocalRegistry`] is a simple on-disk tree of files of the raw index.
+/// * [`RemoteRegistry`] is stored as a raw git repository.
+/// * [`HttpRegistry`] fills the on-disk index cache directly without keeping
+///   any raw index.
 ///
+/// These means of access are handled via the [`RegistryData`] trait abstraction.
 /// This transparently handles caching of the index in a more efficient format.
+///
+/// [`LocalRegistry`]: super::local::LocalRegistry
+/// [`RemoteRegistry`]: super::remote::RemoteRegistry
+/// [`HttpRegistry`]: super::http_remote::HttpRegistry
 pub struct RegistryIndex<'cfg> {
     source_id: SourceId,
     /// Root directory of the index for the registry.
     path: Filesystem,
-    /// Cache of summary data.
+    /// In-memory cache of summary data.
     ///
     /// This is keyed off the package name. The [`Summaries`] value handles
     /// loading the summary data. It keeps an optimized on-disk representation
@@ -110,14 +136,16 @@ pub struct RegistryIndex<'cfg> {
 ///
 /// A list of summaries are loaded from disk via one of two methods:
 ///
-/// 1. Primarily Cargo will parse the corresponding file for a crate in the
-///    upstream crates.io registry. That's just a JSON blob per line which we
-///    can parse, extract the version, and then store here.
+/// 1. From raw registry index --- Primarily Cargo will parse the corresponding
+///    file for a crate in the upstream crates.io registry. That's just a JSON
+///    blob per line which we can parse, extract the version, and then store here.
+///    See [`RegistryPackage`] and [`IndexSummary::parse`].
 ///
-/// 2. Alternatively, if Cargo has previously run, we'll have a cached index of
-///    dependencies for the upstream index. This is a file that Cargo maintains
-///    lazily on the local filesystem and is much faster to parse since it
-///    doesn't involve parsing all of the JSON.
+/// 2. From on-disk index cache --- If Cargo has previously run, we'll have a
+///    cached index of dependencies for the upstream index. This is a file that
+///    Cargo maintains lazily on the local filesystem and is much faster to
+///    parse since it doesn't involve parsing all of the JSON.
+///    See [`SummariesCache`].
 ///
 /// The outward-facing interface of this doesn't matter too much where it's
 /// loaded from, but it's important when reading the implementation to note that
@@ -134,37 +162,100 @@ struct Summaries {
     versions: HashMap<Version, MaybeIndexSummary>,
 }
 
-/// A lazily parsed `IndexSummary`.
+/// A lazily parsed [`IndexSummary`].
 enum MaybeIndexSummary {
     /// A summary which has not been parsed, The `start` and `end` are pointers
-    /// into `Summaries::raw_data` which this is an entry of.
+    /// into [`Summaries::raw_data`] which this is an entry of.
     Unparsed { start: usize, end: usize },
 
     /// An actually parsed summary.
     Parsed(IndexSummary),
 }
 
-/// A parsed representation of a summary from the index.
+/// A parsed representation of a summary from the index. This is usually parsed
+/// from a line from a raw index file, or a JSON blob from on-disk index cache.
 ///
-/// In addition to a full `Summary` we have information on whether it is `yanked`.
+/// In addition to a full [`Summary`], we have information on whether it is `yanked`.
 pub struct IndexSummary {
     pub summary: Summary,
     pub yanked: bool,
-    /// Schema version, see [`RegistryPackage`].
+    /// Schema version, see [`RegistryPackage::v`].
     v: u32,
 }
 
 /// A representation of the cache on disk that Cargo maintains of summaries.
+///
 /// Cargo will initially parse all summaries in the registry and will then
 /// serialize that into this form and place it in a new location on disk,
 /// ensuring that access in the future is much speedier.
+///
+/// For serialization and deserialization of this on-disk index cache of
+/// summaries, see [`SummariesCache::serialize`] and [`SummariesCache::parse`].
+///
+/// # The format of the index cache
+///
+/// The idea of this format is that it's a very easy file for Cargo to parse in
+/// future invocations. The read from disk should be fast and then afterwards
+/// all we need to know is what versions correspond to which JSON blob.
+///
+/// Currently the format looks like:
+///
+/// ```text
+/// +---------------+----------------------+--------------------+---+
+/// | cache version | index format version | index file version | 0 |
+/// +---------------+----------------------+--------------------+---+
+/// ```
+///
+/// followed by one or more (version + JSON blob) pairs...
+///
+/// ```text
+/// +----------------+---+-----------+---+
+/// | semver version | 0 | JSON blob | 0 | ...
+/// +----------------+---+-----------+---+
+/// ```
+///
+/// Each field represents:
+///
+/// * _cache version_ --- Intended to ensure that there's some level of
+///   future compatibility against changes to this cache format so if different
+///   versions of Cargo share the same cache they don't get too confused.
+/// * _index format version_ --- The version of the raw index file.
+///   See [`RegistryPackage::v`] for details.
+/// * _index file version_ --- Tracks when a cache needs to be regenerated.
+///   A cache regeneration is required whenever the index file itself updates.
+/// * _semver version_ --- The version for each JSON blob. Extracted from the
+///   blob for fast queries without parsing the entire blob.
+/// * _JSON blob_ --- The actual metadata for each version of the package. It
+///   has the same representation as [`RegistryPackage`].
+///
+/// # Changes between each cache version
+///
+/// * `1`: The original version.
+/// * `2`: Added the "index format version" field so that if the index format
+///   changes, different versions of cargo won't get confused reading each
+///   other's caches.
+/// * `3`: Bumped the version to work around an issue where multiple versions of
+///   a package were published that differ only by semver metadata. For
+///   example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache
+///   would be incorrectly populated with two entries, both 110.0.0. After
+///   this, the metadata will be correctly included. This isn't really a format
+///   change, just a version bump to clear the incorrect cache entries. Note:
+///   the index shouldn't allow these, but unfortunately crates.io doesn't
+///   check it.
+///
+/// See [`CURRENT_CACHE_VERSION`] for the current cache version.
 #[derive(Default)]
 struct SummariesCache<'a> {
+    /// JSON blobs of the summaries. Each JSON blob has a [`Version`] beside,
+    /// so that Cargo can query a version without full JSON parsing.
     versions: Vec<(Version, &'a [u8])>,
+    /// For cache invalidation, we track the index file version to determine
+    /// when to regenerate the cache itself.
     index_version: &'a str,
 }
 
 impl<'cfg> RegistryIndex<'cfg> {
+    /// Creates an empty registry index at `path`.
     pub fn new(
         source_id: SourceId,
         path: &Filesystem,
@@ -178,7 +269,9 @@ impl<'cfg> RegistryIndex<'cfg> {
         }
     }
 
-    /// Returns the hash listed for a specified `PackageId`.
+    /// Returns the hash listed for a specified `PackageId`. Primarily for
+    /// checking the integrity of a downloaded package matching the checksum in
+    /// the index file, aka [`IndexSummary`].
     pub fn hash(&mut self, pkg: PackageId, load: &mut dyn RegistryData) -> Poll<CargoResult<&str>> {
         let req = OptVersionReq::exact(pkg.version());
         let summary = self.summaries(&pkg.name(), &req, load)?;
@@ -191,10 +284,14 @@ impl<'cfg> RegistryIndex<'cfg> {
     }
 
     /// Load a list of summaries for `name` package in this registry which
-    /// match `req`
+    /// match `req`.
+    ///
+    /// This function will semantically
+    ///
+    /// 1. parse the index file (either raw or cache),
+    /// 2. match all versions,
+    /// 3. and then return an iterator over all summaries which matched.
     ///
-    /// This function will semantically parse the on-disk index, match all
-    /// versions, and then return an iterator over all summaries which matched.
     /// Internally there's quite a few layer of caching to amortize this cost
     /// though since this method is called quite a lot on null builds in Cargo.
     pub fn summaries<'a, 'b>(
@@ -209,10 +306,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         let source_id = self.source_id;
         let config = self.config;
 
-        // First up actually parse what summaries we have available. If Cargo
-        // has run previously this will parse a Cargo-specific cache file rather
-        // than the registry itself. In effect this is intended to be a quite
-        // cheap operation.
+        // First up parse what summaries we have available.
         let name = InternedString::new(name);
         let summaries = ready!(self.load_summaries(name, load)?);
 
@@ -251,13 +345,28 @@ impl<'cfg> RegistryIndex<'cfg> {
             })))
     }
 
+    /// Actually parses what summaries we have available.
+    ///
+    /// If Cargo has run previously, this tries in this order:
+    ///
+    /// 1. Returns from in-memory cache, aka [`RegistryIndex::summaries_cache`].
+    /// 2. If missing, hands over to [`Summaries::parse`] to parse an index file.
+    ///
+    ///    The actual kind of index file being parsed depends on which kind of
+    ///    [`RegistryData`] the `load` argument is given. For example, a
+    ///    Git-based [`RemoteRegistry`] will first try an on-disk index cache
+    ///    file, and then try parsing the raw index from the Git repository.
+    ///
+    /// In effect, this is intended to be a quite cheap operation.
+    ///
+    /// [`RemoteRegistry`]: super::remote::RemoteRegistry
     fn load_summaries(
         &mut self,
         name: InternedString,
         load: &mut dyn RegistryData,
     ) -> Poll<CargoResult<&mut Summaries>> {
         // If we've previously loaded what versions are present for `name`, just
-        // return that since our cache should still be valid.
+        // return that since our in-memory cache should still be valid.
         if self.summaries_cache.contains_key(&name) {
             return Poll::Ready(Ok(self.summaries_cache.get_mut(&name).unwrap()));
         }
@@ -295,6 +404,9 @@ impl<'cfg> RegistryIndex<'cfg> {
         self.summaries_cache.clear();
     }
 
+    /// Attempts to find the packages that match a `name` and a version `req`.
+    ///
+    /// This is primarily used by [`Source::query`](super::Source).
     pub fn query_inner(
         &mut self,
         name: &str,
@@ -324,6 +436,10 @@ impl<'cfg> RegistryIndex<'cfg> {
             .map_ok(|_| ())
     }
 
+    /// Inner implementation of [`Self::query_inner`]. Returns the number of
+    /// summaries we've got.
+    ///
+    /// The `online` controls whether Cargo can access the network when needed.
     fn query_inner_with_online(
         &mut self,
         name: &str,
@@ -404,6 +520,7 @@ impl<'cfg> RegistryIndex<'cfg> {
         Poll::Ready(Ok(count))
     }
 
+    /// Looks into the summaries to check if a package has been yanked.
     pub fn is_yanked(
         &mut self,
         pkg: PackageId,
@@ -418,23 +535,26 @@ impl<'cfg> RegistryIndex<'cfg> {
 }
 
 impl Summaries {
-    /// Parse out a `Summaries` instances from on-disk state.
+    /// Parse out a [`Summaries`] instance from on-disk state.
+    ///
+    /// This will do the following in order:
     ///
-    /// This will attempt to prefer parsing a previous cache file that already
-    /// exists from a previous invocation of Cargo (aka you're typing `cargo
-    /// build` again after typing it previously). If parsing fails or the cache
-    /// isn't found, then we take a slower path which loads the full descriptor
-    /// for `relative` from the underlying index (aka typically libgit2 with
-    /// crates.io) and then parse everything in there.
+    /// 1. Attempt to prefer parsing a previous index cache file that already
+    ///    exists from a previous invocation of Cargo (aka you're typing `cargo
+    ///    build` again after typing it previously).
+    /// 2. If parsing fails, or the cache isn't found or is invalid, we then
+    ///    take a slower path which loads the full descriptor for `relative`
+    ///    from the underlying index (aka libgit2 with crates.io, or from a
+    ///    remote HTTP index) and then parse everything in there.
     ///
-    /// * `root` - this is the root argument passed to `load`
-    /// * `cache_root` - this is the root on the filesystem itself of where to
-    ///   store cache files.
-    /// * `relative` - this is the file we're loading from cache or the index
+    /// * `root` --- this is the root argument passed to `load`
+    /// * `cache_root` --- this is the root on the filesystem itself of where
+    ///   to store cache files.
+    /// * `relative` --- this is the file we're loading from cache or the index
     ///   data
-    /// * `source_id` - the registry's SourceId used when parsing JSON blobs to
-    ///   create summaries.
-    /// * `load` - the actual index implementation which may be very slow to
+    /// * `source_id` --- the registry's SourceId used when parsing JSON blobs
+    ///   to create summaries.
+    /// * `load` --- the actual index implementation which may be very slow to
     ///   call. We avoid this if we can.
     pub fn parse(
         root: &Path,
@@ -549,8 +669,8 @@ impl Summaries {
         }
     }
 
-    /// Parses an open `File` which represents information previously cached by
-    /// Cargo.
+    /// Parses the contents of an on-disk cache, aka [`SummariesCache`], which
+    /// represents information previously cached by Cargo.
     pub fn parse_cache(contents: Vec<u8>) -> CargoResult<(Summaries, InternedString)> {
         let cache = SummariesCache::parse(&contents)?;
         let index_version = InternedString::new(cache.index_version);
@@ -577,46 +697,11 @@ impl Summaries {
     }
 }
 
-// Implementation of serializing/deserializing the cache of summaries on disk.
-// Currently the format looks like:
-//
-// +--------------------+----------------------+-------------+---+
-// | cache version byte | index format version | git sha rev | 0 |
-// +--------------------+----------------------+-------------+---+
-//
-// followed by...
-//
-// +----------------+---+------------+---+
-// | semver version | 0 |  JSON blob | 0 | ...
-// +----------------+---+------------+---+
-//
-// The idea is that this is a very easy file for Cargo to parse in future
-// invocations. The read from disk should be quite fast and then afterwards all
-// we need to know is what versions correspond to which JSON blob.
-//
-// The leading version byte is intended to ensure that there's some level of
-// future compatibility against changes to this cache format so if different
-// versions of Cargo share the same cache they don't get too confused. The git
-// sha lets us know when the file needs to be regenerated (it needs regeneration
-// whenever the index itself updates).
-//
-// Cache versions:
-// * `1`: The original version.
-// * `2`: Added the "index format version" field so that if the index format
-//   changes, different versions of cargo won't get confused reading each
-//   other's caches.
-// * `3`: Bumped the version to work around an issue where multiple versions of
-//   a package were published that differ only by semver metadata. For
-//   example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache
-//   would be incorrectly populated with two entries, both 110.0.0. After
-//   this, the metadata will be correctly included. This isn't really a format
-//   change, just a version bump to clear the incorrect cache entries. Note:
-//   the index shouldn't allow these, but unfortunately crates.io doesn't
-//   check it.
-
+/// The current version of [`SummariesCache`].
 const CURRENT_CACHE_VERSION: u8 = 3;
 
 impl<'a> SummariesCache<'a> {
+    /// Deserializes an on-disk cache.
     fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> {
         // NB: keep this method in sync with `serialize` below
         let (first_byte, rest) = data
@@ -655,6 +740,7 @@ impl<'a> SummariesCache<'a> {
         Ok(ret)
     }
 
+    /// Serializes itself with a given `index_version`.
     fn serialize(&self, index_version: &str) -> Vec<u8> {
         // NB: keep this method in sync with `parse` above
         let size = self
@@ -709,10 +795,11 @@ impl From<IndexSummary> for MaybeIndexSummary {
 }
 
 impl IndexSummary {
-    /// Parses a line from the registry's index file into an `IndexSummary` for
-    /// a package.
+    /// Parses a line from the registry's index file into an [`IndexSummary`]
+    /// for a package.
     ///
-    /// The `line` provided is expected to be valid JSON.
+    /// The `line` provided is expected to be valid JSON. It is supposed to be
+    /// a [`RegistryPackage`].
     fn parse(config: &Config, line: &[u8], source_id: SourceId) -> CargoResult<IndexSummary> {
         // ****CAUTION**** Please be extremely careful with returning errors
         // from this function. Entries that error are not included in the
@@ -754,6 +841,7 @@ impl IndexSummary {
     }
 }
 
+/// Like [`slice::split`] but is optimized by [`memchr`].
 fn split(haystack: &[u8], needle: u8) -> impl Iterator<Item = &[u8]> {
     struct Split<'a> {
         haystack: &'a [u8],