diff --git a/Cargo.lock b/Cargo.lock index 2451ddf19f03..2f2c120e7f89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -362,12 +362,35 @@ dependencies = [ "serde", ] +[[package]] +name = "cargo-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77042b5b585f701f1cfb4b6b12ebc02b9b0cefbc8dcce235906b6bf376d4245d" +dependencies = [ + "anyhow", + "core-foundation", + "filetime", + "hex", + "jobserver", + "libc", + "miow", + "same-file", + "sha2", + "shell-escape", + "tempfile", + "tracing", + "walkdir", + "windows-sys 0.48.0", +] + [[package]] name = "cc" version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ + "jobserver", "libc", ] @@ -956,6 +979,27 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +[[package]] +name = "git2" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf97ba92db08df386e10c8ede66a2a0369bd277090afd8710e19e38de9ec0cd" +dependencies = [ + "bitflags 2.4.1", + "libc", + "libgit2-sys", + "log", + "openssl-probe", + "openssl-sys", + "url", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "globset" version = "0.4.13" @@ -1412,6 +1456,15 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" 
version = "0.3.64" @@ -1433,6 +1486,46 @@ version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +[[package]] +name = "libgit2-sys" +version = "0.16.1+1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2a2bb3680b094add03bb3732ec520ece34da31a8cd2d633d1389d0f0fb60d0c" +dependencies = [ + "cc", + "libc", + "libssh2-sys", + "libz-sys", + "openssl-sys", + "pkg-config", +] + +[[package]] +name = "libssh2-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "line-wrap" version = "0.1.1" @@ -1582,6 +1675,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "miow" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "359f76430b20a79f9e20e115b3428614e654f04fab314482fc0fda0ebd3c6044" +dependencies = [ + "windows-sys 0.48.0", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1638,6 +1740,34 @@ version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-src" +version = "300.1.6+3.1.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "439fac53e092cd7442a3660c85dde4643ab3b5bd39040912388dcdabf6b88085" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -1778,6 +1908,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + [[package]] name = "plain" version = "0.2.3" @@ -2110,6 +2246,25 @@ dependencies = [ "url", ] +[[package]] +name = "puffin-git" +version = "0.0.1" +dependencies = [ + "anyhow", + "cargo-util", + "git2", + "glob", + "hex", + "once_cell", + "puffin-cache", + "rand", + "reqwest", + "serde", + "tokio", + "tracing", + "url", +] + [[package]] name = "puffin-installer" version = "0.0.1" @@ -2218,6 +2373,7 @@ dependencies = [ "pubgrub", "puffin-client", "puffin-distribution", + "puffin-git", "puffin-interpreter", "puffin-normalize", "puffin-package", @@ -2841,6 +2997,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shell-escape" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45bb67a18fa91266cc7807181f62f9178a6873bfad7dc788c42e6430db40184f" + [[package]] name = "similar" version = "2.3.0" @@ -3505,6 +3667,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" diff --git a/Cargo.toml b/Cargo.toml index 326dbbd11c34..943d3b24f43d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ fs-err = { version = "2.9.0" } fs2 = { version = "0.4.3" } futures = { version = "0.3.28" } fxhash = { version = "0.2.1" } +glob = { version = "0.3.1" } goblin = { version = "0.7.1" } hex = { version = "0.4.3" } http-cache-reqwest = { version = "0.11.3" } @@ -42,6 +43,7 @@ petgraph = { version = "0.6.4" } platform-info = { version = "2.0.2" } plist = { version = "1.5.0" } pyproject-toml = { version = "0.7.0" } +rand = { version = "0.8.5" } rayon = { version = "1.8.0" } reflink-copy = { version = "0.1.10" } regex = { version = "1.9.6" } diff --git a/crates/puffin-build/src/lib.rs b/crates/puffin-build/src/lib.rs index 57022cd7ec9d..fe0954cc7543 100644 --- a/crates/puffin-build/src/lib.rs +++ b/crates/puffin-build/src/lib.rs @@ -117,7 +117,8 @@ pub struct SourceDistributionBuilder { } impl SourceDistributionBuilder { - /// Extract the source distribution and create a venv with the required packages + /// Create a virtual environment in which to build a source distribution, extracting the + /// contents from an archive if necessary. pub async fn setup( sdist: &Path, interpreter_info: &InterpreterInfo, @@ -126,9 +127,13 @@ impl SourceDistributionBuilder { let temp_dir = tempdir()?; // TODO(konstin): Parse and verify filenames - debug!("Unpacking for build {}", sdist.display()); - let extracted = temp_dir.path().join("extracted"); - let source_tree = extract_archive(sdist, &extracted)?; + let source_tree = if fs::metadata(sdist)?.is_dir() { + sdist.to_path_buf() + } else { + debug!("Unpacking for build: {}", sdist.display()); + let extracted = temp_dir.path().join("extracted"); + extract_archive(sdist, &extracted)? 
/// Resolve Flask when it is requested as a direct Git-over-HTTPS URL dependency.
///
/// The requirement does not pin a branch, tag, or commit, so the repository's
/// default branch is what gets resolved. The test needs network access to
/// GitHub.
#[test]
fn compile_git_https_dependency() -> Result<()> {
    // Scratch directories: one for the project, one for the cache.
    let project = assert_fs::TempDir::new()?;
    let cache = assert_fs::TempDir::new()?;

    // Create the virtual environment the resolver will run against.
    let venv = project.child(".venv");
    Command::new(get_cargo_bin(BIN_NAME))
        .arg("venv")
        .arg(venv.as_os_str())
        .arg("--cache-dir")
        .arg(cache.path())
        .current_dir(&project)
        .assert()
        .success();
    venv.assert(predicates::path::is_dir());

    // A single requirement that points at the Flask Git repository.
    let requirements = project.child("requirements.in");
    requirements.touch()?;
    requirements.write_str("flask @ git+https://github.com/pallets/flask.git")?;

    // Timings, the binary path and the cache directory differ between runs, so
    // they are masked before the output is compared with the snapshot.
    insta::with_settings!({
        filters => vec![
            (r"(\d|\.)+(ms|s)", "[TIME]"),
            (r"# .* pip-compile", "# [BIN_PATH] pip-compile"),
            (r"--cache-dir .*", "--cache-dir [CACHE_DIR]"),
        ]
    }, {
        assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
            .arg("pip-compile")
            .arg("requirements.in")
            .arg("--cache-dir")
            .arg(cache.path())
            .env("VIRTUAL_ENV", venv.as_os_str())
            .current_dir(&project));
    });

    Ok(())
}
#[test] diff --git a/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap b/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap new file mode 100644 index 000000000000..393b62e7718c --- /dev/null +++ b/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap @@ -0,0 +1,36 @@ +--- +source: crates/puffin-cli/tests/pip_compile.rs +info: + program: puffin + args: + - pip-compile + - requirements.in + - "--cache-dir" + - /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpbvYz3u + env: + VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpHYkK5F/.venv +--- +success: true +exit_code: 0 +----- stdout ----- +# This file was autogenerated by Puffin v0.0.1 via the following command: +# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR] +blinker==1.7.0 + # via flask +click==8.1.7 + # via flask +flask @ git+https://github.com/pallets/flask.git +itsdangerous==2.1.2 + # via flask +jinja2==3.1.2 + # via flask +markupsafe==2.1.3 + # via + # jinja2 + # werkzeug +werkzeug==3.0.1 + # via flask + +----- stderr ----- +Resolved 7 packages in [TIME] + diff --git a/crates/puffin-git/Cargo.toml b/crates/puffin-git/Cargo.toml new file mode 100644 index 000000000000..9e4cf4c720bb --- /dev/null +++ b/crates/puffin-git/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "puffin-git" +version = "0.0.1" +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } + +[dependencies] +puffin-cache = { path = "../puffin-cache" } + +anyhow = { workspace = true } +cargo-util = { version = "0.2.6" } +git2 = { version = "0.18.1" } +glob = { workspace = true } +hex = { workspace = true } +once_cell = { workspace = true } +rand = { workspace = true } +serde = { workspace = true } +tracing = { workspace = true } +url = { workspace 
= true } +reqwest = { workspace = true, features = ["blocking"] } +tokio.workspace = true + +[features] +vendored-libgit2 = ["git2/vendored-libgit2"] +vendored-openssl = ["git2/vendored-openssl"] diff --git a/crates/puffin-git/src/git.rs b/crates/puffin-git/src/git.rs new file mode 100644 index 000000000000..ebc43b3d3014 --- /dev/null +++ b/crates/puffin-git/src/git.rs @@ -0,0 +1,1365 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::borrow::Cow; +use std::path::{Path, PathBuf}; +use std::process::Command; +use std::{env, str}; + +use anyhow::{anyhow, Context as _, Result}; +use cargo_util::{paths, ProcessBuilder}; +use git2::{self, ErrorClass, ObjectType}; +use reqwest::Client; +use reqwest::StatusCode; +use tracing::{debug, info, warn}; +use url::Url; + +use crate::util::retry; +use crate::{FetchStrategy, GitReference}; + +/// A file indicates that if present, `git reset` has been done and a repo +/// checkout is ready to go. See [`GitCheckout::reset`] for why we need this. +const CHECKOUT_READY_LOCK: &str = ".ok"; + +/// A short abbreviated OID. +/// +/// Exists for avoiding extra allocations in [`GitDatabase::to_short_id`]. +pub(crate) struct GitShortID(git2::Buf); + +impl GitShortID { + /// Views the short ID as a `str`. + pub(crate) fn as_str(&self) -> &str { + self.0.as_str().unwrap() + } +} + +/// A remote repository. It gets cloned into a local [`GitDatabase`]. +#[derive(PartialEq, Clone, Debug)] +pub(crate) struct GitRemote { + /// URL to a remote repository. + url: Url, +} + +/// A local clone of a remote repository's database. Multiple [`GitCheckout`]s +/// can be cloned from a single [`GitDatabase`]. +pub(crate) struct GitDatabase { + /// The remote repository where this database is fetched from. + remote: GitRemote, + /// Path to the root of the underlying Git repository on the local filesystem. 
+ path: PathBuf, + /// Underlying Git repository instance for this database. + repo: git2::Repository, +} + +/// A local checkout of a particular revision from a [`GitDatabase`]. +pub(crate) struct GitCheckout<'a> { + /// The git database where this checkout is cloned from. + database: &'a GitDatabase, + /// Path to the root of the underlying Git repository on the local filesystem. + path: PathBuf, + /// The git revision this checkout is for. + revision: git2::Oid, + /// Underlying Git repository instance for this checkout. + repo: git2::Repository, +} + +impl GitRemote { + /// Creates an instance for a remote repository URL. + pub(crate) fn new(url: &Url) -> GitRemote { + GitRemote { url: url.clone() } + } + + /// Gets the remote repository URL. + pub(crate) fn url(&self) -> &Url { + &self.url + } + + /// Fetches and checkouts to a reference or a revision from this remote + /// into a local path. + /// + /// This ensures that it gets the up-to-date commit when a named reference + /// is given (tag, branch, refs/*). Thus, network connection is involved. + /// + /// When `locked_rev` is provided, it takes precedence over `reference`. + /// + /// If we have a previous instance of [`GitDatabase`] then fetch into that + /// if we can. If that can successfully load our revision then we've + /// populated the database with the latest version of `reference`, so + /// return that database and the rev we resolve to. 
+ pub(crate) fn checkout( + &self, + into: &Path, + db: Option, + reference: &GitReference, + locked_rev: Option, + strategy: FetchStrategy, + client: &Client, + ) -> Result<(GitDatabase, git2::Oid)> { + let locked_ref = locked_rev.map(|oid| GitReference::Rev(oid.to_string())); + let reference = locked_ref.as_ref().unwrap_or(reference); + if let Some(mut db) = db { + fetch(&mut db.repo, self.url.as_str(), reference, strategy, client) + .with_context(|| format!("failed to fetch into: {}", into.display()))?; + + let resolved_commit_hash = match locked_rev { + Some(rev) => db.contains(rev).then_some(rev), + None => reference.resolve(&db.repo).ok(), + }; + if let Some(rev) = resolved_commit_hash { + return Ok((db, rev)); + } + } + + // Otherwise start from scratch to handle corrupt git repositories. + // After our fetch (which is interpreted as a clone now) we do the same + // resolution to figure out what we cloned. + if into.exists() { + paths::remove_dir_all(into)?; + } + paths::create_dir_all(into)?; + let mut repo = init(into, true)?; + fetch(&mut repo, self.url.as_str(), reference, strategy, client) + .with_context(|| format!("failed to clone into: {}", into.display()))?; + let rev = match locked_rev { + Some(rev) => rev, + None => reference.resolve(&repo)?, + }; + + Ok(( + GitDatabase { + remote: self.clone(), + path: into.to_path_buf(), + repo, + }, + rev, + )) + } + + /// Creates a [`GitDatabase`] of this remote at `db_path`. + pub(crate) fn db_at(&self, db_path: &Path) -> Result { + let repo = git2::Repository::open(db_path)?; + Ok(GitDatabase { + remote: self.clone(), + path: db_path.to_path_buf(), + repo, + }) + } +} + +impl GitDatabase { + /// Checkouts to a revision at `destination` from this database. + pub(crate) fn copy_to( + &self, + rev: git2::Oid, + destination: &Path, + strategy: FetchStrategy, + client: &Client, + ) -> Result> { + // If the existing checkout exists, and it is fresh, use it. 
+ // A non-fresh checkout can happen if the checkout operation was + // interrupted. In that case, the checkout gets deleted and a new + // clone is created. + let checkout = match git2::Repository::open(destination) + .ok() + .map(|repo| GitCheckout::new(self, rev, repo)) + .filter(GitCheckout::is_fresh) + { + Some(co) => co, + None => GitCheckout::clone_into(destination, self, rev)?, + }; + checkout.update_submodules(strategy, client)?; + Ok(checkout) + } + + /// Get a short OID for a `revision`, usually 7 chars or more if ambiguous. + pub(crate) fn to_short_id(&self, revision: git2::Oid) -> Result { + let obj = self.repo.find_object(revision, None)?; + Ok(GitShortID(obj.short_id()?)) + } + + /// Checks if the database contains the object of this `oid`. + pub(crate) fn contains(&self, oid: git2::Oid) -> bool { + self.repo.revparse_single(&oid.to_string()).is_ok() + } +} + +impl GitReference { + /// Resolves self to an object ID with objects the `repo` currently has. + pub(crate) fn resolve(&self, repo: &git2::Repository) -> Result { + let id = match self { + // Note that we resolve the named tag here in sync with where it's + // fetched into via `fetch` below. + GitReference::Tag(s) => (|| -> Result { + let refname = format!("refs/remotes/origin/tags/{s}"); + let id = repo.refname_to_id(&refname)?; + let obj = repo.find_object(id, None)?; + let obj = obj.peel(ObjectType::Commit)?; + Ok(obj.id()) + })() + .with_context(|| format!("failed to find tag `{s}`"))?, + + // Resolve the remote name since that's all we're configuring in + // `fetch` below. + GitReference::Branch(s) => { + let name = format!("origin/{s}"); + let b = repo + .find_branch(&name, git2::BranchType::Remote) + .with_context(|| format!("failed to find branch `{s}`"))?; + b.get() + .target() + .ok_or_else(|| anyhow::format_err!("branch `{s}` did not have a target"))? 
+ } + + // We'll be using the HEAD commit + GitReference::DefaultBranch => { + let head_id = repo.refname_to_id("refs/remotes/origin/HEAD")?; + let head = repo.find_object(head_id, None)?; + head.peel(ObjectType::Commit)?.id() + } + + GitReference::Rev(s) => { + let obj = repo.revparse_single(s)?; + match obj.as_tag() { + Some(tag) => tag.target_id(), + None => obj.id(), + } + } + }; + Ok(id) + } +} + +impl<'a> GitCheckout<'a> { + /// Creates an instance of [`GitCheckout`]. This doesn't imply the checkout + /// is done. Use [`GitCheckout::is_fresh`] to check. + /// + /// * The `database` is where this checkout is from. + /// * The `repo` will be the checked out Git repository. + fn new( + database: &'a GitDatabase, + revision: git2::Oid, + repo: git2::Repository, + ) -> GitCheckout<'a> { + let path = repo.workdir().unwrap_or_else(|| repo.path()); + GitCheckout { + path: path.to_path_buf(), + database, + revision, + repo, + } + } + + /// Gets the remote repository URL. + fn remote_url(&self) -> &Url { + self.database.remote.url() + } + + /// Clone a repo for a `revision` into a local path from a `datatabase`. + /// This is a filesystem-to-filesystem clone. + fn clone_into( + into: &Path, + database: &'a GitDatabase, + revision: git2::Oid, + ) -> Result> { + let dirname = into.parent().unwrap(); + paths::create_dir_all(dirname)?; + if into.exists() { + paths::remove_dir_all(into)?; + } + + // we're doing a local filesystem-to-filesystem clone so there should + // be no need to respect global configuration options, so pass in + // an empty instance of `git2::Config` below. + let git_config = git2::Config::new()?; + + // Clone the repository, but make sure we use the "local" option in + // libgit2 which will attempt to use hardlinks to set up the database. + // This should speed up the clone operation quite a bit if it works. 
+ // + // Note that we still use the same fetch options because while we don't + // need authentication information we may want progress bars and such. + let url = Url::from_file_path(&database.path) + .map_err(|()| anyhow::format_err!("Invalid path URL: {}", database.path.display()))?; + let mut repo = None; + with_fetch_options(&git_config, url.as_str(), &mut |fopts| { + let mut checkout = git2::build::CheckoutBuilder::new(); + checkout.dry_run(); // we'll do this below during a `reset` + + let r = git2::build::RepoBuilder::new() + // use hard links and/or copy the database, we're doing a + // filesystem clone so this'll speed things up quite a bit. + .clone_local(git2::build::CloneLocal::Local) + .with_checkout(checkout) + .fetch_options(fopts) + .clone(url.as_str(), into)?; + // `git2` doesn't seem to handle shallow repos correctly when doing + // a local clone. Fortunately all that's needed is the copy of the + // one file that defines the shallow boundary, the commits which + // have their parents omitted as part of the shallow clone. + // + // TODO(git2): remove this when git2 supports shallow clone correctly + if database.repo.is_shallow() { + std::fs::copy( + database.repo.path().join("shallow"), + r.path().join("shallow"), + )?; + } + repo = Some(r); + Ok(()) + })?; + let repo = repo.unwrap(); + + let checkout = GitCheckout::new(database, revision, repo); + checkout.reset()?; + Ok(checkout) + } + + /// Checks if the `HEAD` of this checkout points to the expected revision. + fn is_fresh(&self) -> bool { + match self.repo.revparse_single("HEAD") { + Ok(ref head) if head.id() == self.revision => { + // See comments in reset() for why we check this + self.path.join(CHECKOUT_READY_LOCK).exists() + } + _ => false, + } + } + + /// Similar to [`reset()`]. This roughly performs `git reset --hard` to the + /// revision of this checkout, with additional interrupt protection by a + /// dummy file [`CHECKOUT_READY_LOCK`]. 
+ /// + /// If we're interrupted while performing a `git reset` (e.g., we die + /// because of a signal) Cargo needs to be sure to try to check out this + /// repo again on the next go-round. + /// + /// To enable this we have a dummy file in our checkout, [`.cargo-ok`], + /// which if present means that the repo has been successfully reset and is + /// ready to go. Hence if we start to do a reset, we make sure this file + /// *doesn't* exist, and then once we're done we create the file. + /// + /// [`.cargo-ok`]: CHECKOUT_READY_LOCK + fn reset(&self) -> Result<()> { + let ok_file = self.path.join(CHECKOUT_READY_LOCK); + let _ = paths::remove_file(&ok_file); + info!("reset {} to {}", self.repo.path().display(), self.revision); + + // Ensure libgit2 won't mess with newlines when we vendor. + if let Ok(mut git_config) = self.repo.config() { + git_config.set_bool("core.autocrlf", false)?; + } + + let object = self.repo.find_object(self.revision, None)?; + reset(&self.repo, &object)?; + paths::create(ok_file)?; + Ok(()) + } + + /// Like `git submodule update --recursive` but for this git checkout. + /// + /// This function respects `submodule..update = none`[^1] git config. + /// Submodules set to `none` won't be fetched. + /// + /// [^1]: + fn update_submodules(&self, strategy: FetchStrategy, client: &Client) -> Result<()> { + /// Like `Cow`, but without a requirement on `Clone`. + enum Repo<'a> { + Borrowed(&'a git2::Repository), + Owned(git2::Repository), + } + + impl std::ops::Deref for Repo<'_> { + type Target = git2::Repository; + + fn deref(&self) -> &Self::Target { + match self { + Repo::Borrowed(repo) => repo, + Repo::Owned(repo) => repo, + } + } + } + + debug!( + "Update submodules for: {}", + self.repo.workdir().unwrap().display() + ); + + // Initialize a stack with the root repository. 
+ let mut stack = vec![( + Repo::Borrowed(&self.repo), + Cow::Borrowed(self.remote_url().as_str()), + )]; + + while let Some((repo, parent_remote_url)) = stack.pop() { + for mut child in repo.submodules()? { + child.init(false)?; + + let child_url_str = child.url().ok_or_else(|| { + anyhow::format_err!("non-utf8 url for submodule {:?}?", child.path()) + })?; + + // Skip the submodule if the config says not to update it. + if child.update_strategy() == git2::SubmoduleUpdate::None { + debug!( + "Skipping git submodule `{}` due to update strategy in .gitmodules", + child_url_str + ); + continue; + } + + let child_remote_url = + absolute_submodule_url(&parent_remote_url, child_url_str)?.to_string(); + + // A submodule which is listed in .gitmodules but not actually + // checked out will not have a head id, so we should ignore it. + let Some(head) = child.head_id() else { + continue; + }; + + // If the submodule hasn't been checked out yet, we need to + // clone it. If it has been checked out and the head is the same + // as the submodule's head, then we can skip an update and keep + // recursing. + let head_and_repo = child.open().and_then(|repo| { + let target = repo.head()?.target(); + Ok((target, repo)) + }); + let mut repo = if let Ok((head, repo)) = head_and_repo { + if child.head_id() == head { + stack.push((Repo::Owned(repo), Cow::Owned(child_remote_url))); + continue; + } + repo + } else { + let path = repo.workdir().unwrap().join(child.path()); + let _ = paths::remove_dir_all(&path); + init(&path, false)? 
+ }; + + // Fetch data from origin and reset to the head commit + debug!("Updating Git submodule: {}", child_remote_url); + let reference = GitReference::Rev(head.to_string()); + fetch(&mut repo, &child_remote_url, &reference, strategy, client).with_context( + || { + format!( + "failed to fetch submodule `{}` from {}", + child.name().unwrap_or(""), + child_remote_url + ) + }, + )?; + + let obj = repo.find_object(head, None)?; + reset(&repo, &obj)?; + drop(obj); + + // Push the current submodule onto the stack. + stack.push((Repo::Owned(repo), Cow::Owned(child_remote_url))); + } + } + + Ok(()) + } +} + +/// Constructs an absolute URL for a child submodule URL with its parent base URL. +/// +/// Git only assumes a submodule URL is a relative path if it starts with `./` +/// or `../` [^1]. To fetch the correct repo, we need to construct an absolute +/// submodule URL. +/// +/// At this moment it comes with some limitations: +/// +/// * GitHub doesn't accept non-normalized URLs with relative paths. +/// (`ssh://git@github.com/rust-lang/cargo.git/relative/..` is invalid) +/// * `url` crate cannot parse SCP-like URLs. +/// (`git@github.com:rust-lang/cargo.git` is not a valid WHATWG URL) +/// +/// To overcome these, this patch always tries [`Url::parse`] first to normalize +/// the path. If it couldn't, append the relative path as the last resort and +/// pray the remote git service supports non-normalized URLs. +/// +/// See also rust-lang/cargo#12404 and rust-lang/cargo#12295. 
+/// +/// [^1]: +fn absolute_submodule_url<'s>(base_url: &str, submodule_url: &'s str) -> Result> { + let absolute_url = if ["./", "../"].iter().any(|p| submodule_url.starts_with(p)) { + if let Ok(mut base_url) = Url::parse(base_url) { + let path = base_url.path(); + if !path.ends_with('/') { + base_url.set_path(&format!("{path}/")); + } + let absolute_url = base_url.join(submodule_url).with_context(|| { + format!( + "Failed to parse relative child submodule URL `{submodule_url}` using parent base URL `{base_url}`" + ) + })?; + Cow::from(absolute_url.to_string()) + } else { + let mut absolute_url = base_url.to_string(); + if !absolute_url.ends_with('/') { + absolute_url.push('/'); + } + absolute_url.push_str(submodule_url); + Cow::from(absolute_url) + } + } else { + Cow::from(submodule_url) + }; + + Ok(absolute_url) +} + +/// Prepare the authentication callbacks for cloning a git repository. +/// +/// The main purpose of this function is to construct the "authentication +/// callback" which is used to clone a repository. This callback will attempt to +/// find the right authentication on the system (without user input) and will +/// guide libgit2 in doing so. +/// +/// The callback is provided `allowed` types of credentials, and we try to do as +/// much as possible based on that: +/// +/// * Prioritize SSH keys from the local ssh agent as they're likely the most +/// reliable. The username here is prioritized from the credential +/// callback, then from whatever is configured in git itself, and finally +/// we fall back to the generic user of `git`. +/// +/// * If a username/password is allowed, then we fallback to git2-rs's +/// implementation of the credential helper. This is what is configured +/// with `credential.helper` in git, and is the interface for the macOS +/// keychain, for example. +/// +/// * After the above two have failed, we just kinda grapple attempting to +/// return *something*. 
+/// +/// If any form of authentication fails, libgit2 will repeatedly ask us for +/// credentials until we give it a reason to not do so. To ensure we don't +/// just sit here looping forever we keep track of authentications we've +/// attempted and we don't try the same ones again. +fn with_authentication(url: &str, cfg: &git2::Config, mut f: F) -> Result +where + F: FnMut(&mut git2::Credentials<'_>) -> Result, +{ + let mut cred_helper = git2::CredentialHelper::new(url); + cred_helper.config(cfg); + + let mut ssh_username_requested = false; + let mut cred_helper_bad = None; + let mut ssh_agent_attempts = Vec::new(); + let mut any_attempts = false; + let mut tried_sshkey = false; + let mut url_attempt = None; + + let orig_url = url; + let mut res = f(&mut |url, username, allowed| { + any_attempts = true; + if url != orig_url { + url_attempt = Some(url.to_string()); + } + // libgit2's "USERNAME" authentication actually means that it's just + // asking us for a username to keep going. This is currently only really + // used for SSH authentication and isn't really an authentication type. + // The logic currently looks like: + // + // let user = ...; + // if (user.is_null()) + // user = callback(USERNAME, null, ...); + // + // callback(SSH_KEY, user, ...) + // + // So if we're being called here then we know that (a) we're using ssh + // authentication and (b) no username was specified in the URL that + // we're trying to clone. We need to guess an appropriate username here, + // but that may involve a few attempts. Unfortunately we can't switch + // usernames during one authentication session with libgit2, so to + // handle this we bail out of this authentication session after setting + // the flag `ssh_username_requested`, and then we handle this below. 
+ if allowed.contains(git2::CredentialType::USERNAME) { + debug_assert!(username.is_none()); + ssh_username_requested = true; + return Err(git2::Error::from_str("gonna try usernames later")); + } + + // An "SSH_KEY" authentication indicates that we need some sort of SSH + // authentication. This can currently either come from the ssh-agent + // process or from a raw in-memory SSH key. Cargo only supports using + // ssh-agent currently. + // + // If we get called with this then the only way that should be possible + // is if a username is specified in the URL itself (e.g., `username` is + // Some), hence the unwrap() here. We try custom usernames down below. + if allowed.contains(git2::CredentialType::SSH_KEY) && !tried_sshkey { + // If ssh-agent authentication fails, libgit2 will keep + // calling this callback asking for other authentication + // methods to try. Make sure we only try ssh-agent once, + // to avoid looping forever. + tried_sshkey = true; + let username = username.unwrap(); + debug_assert!(!ssh_username_requested); + ssh_agent_attempts.push(username.to_string()); + return git2::Cred::ssh_key_from_agent(username); + } + + // Sometimes libgit2 will ask for a username/password in plaintext. This + // is where Cargo would have an interactive prompt if we supported it, + // but we currently don't! Right now the only way we support fetching a + // plaintext password is through the `credential.helper` support, so + // fetch that here. + // + // If ssh-agent authentication fails, libgit2 will keep calling this + // callback asking for other authentication methods to try. Check + // cred_helper_bad to make sure we only try the git credential helper + // once, to avoid looping forever. + if allowed.contains(git2::CredentialType::USER_PASS_PLAINTEXT) && cred_helper_bad.is_none() + { + let r = git2::Cred::credential_helper(cfg, url, username); + cred_helper_bad = Some(r.is_err()); + return r; + } + + // I'm... 
not sure what the DEFAULT kind of authentication is, but seems + // easy to support? + if allowed.contains(git2::CredentialType::DEFAULT) { + return git2::Cred::default(); + } + + // Whelp, we tried our best + Err(git2::Error::from_str("no authentication methods succeeded")) + }); + + // Ok, so if it looks like we're going to be doing ssh authentication, we + // want to try a few different usernames as one wasn't specified in the URL + // for us to use. In order, we'll try: + // + // * A credential helper's username for this URL, if available. + // * This account's username. + // * "git" + // + // We have to restart the authentication session each time (due to + // constraints in libssh2 I guess? maybe this is inherent to ssh?), so we + // call our callback, `f`, in a loop here. + if ssh_username_requested { + debug_assert!(res.is_err()); + let mut attempts = vec![String::from("git")]; + if let Ok(s) = env::var("USER").or_else(|_| env::var("USERNAME")) { + attempts.push(s); + } + if let Some(ref s) = cred_helper.username { + attempts.push(s.clone()); + } + + while let Some(s) = attempts.pop() { + // We should get `USERNAME` first, where we just return our attempt, + // and then after that we should get `SSH_KEY`. If the first attempt + // fails we'll get called again, but we don't have another option so + // we bail out. + let mut attempts = 0; + res = f(&mut |_url, username, allowed| { + if allowed.contains(git2::CredentialType::USERNAME) { + return git2::Cred::username(&s); + } + if allowed.contains(git2::CredentialType::SSH_KEY) { + debug_assert_eq!(Some(&s[..]), username); + attempts += 1; + if attempts == 1 { + ssh_agent_attempts.push(s.to_string()); + return git2::Cred::ssh_key_from_agent(&s); + } + } + Err(git2::Error::from_str("no authentication methods succeeded")) + }); + + // If we made two attempts then that means: + // + // 1. A username was requested, we returned `s`. + // 2. An ssh key was requested, we returned to look up `s` in the + // ssh agent. 
+ // 3. For whatever reason that lookup failed, so we were asked again + // for another mode of authentication. + // + // Essentially, if `attempts == 2` then in theory the only error was + // that this username failed to authenticate (e.g., no other network + // errors happened). Otherwise something else is funny so we bail + // out. + if attempts != 2 { + break; + } + } + } + let mut err = match res { + Ok(e) => return Ok(e), + Err(e) => e, + }; + + // In the case of an authentication failure (where we tried something) then + // we try to give a more helpful error message about precisely what we + // tried. + if any_attempts { + let mut msg = "failed to authenticate when downloading repository".to_string(); + + if let Some(attempt) = &url_attempt { + if url != attempt { + msg.push_str(": "); + msg.push_str(attempt); + } + } + msg.push('\n'); + if !ssh_agent_attempts.is_empty() { + let names = ssh_agent_attempts + .iter() + .map(|agent| format!("`{agent}`")) + .collect::>() + .join(", "); + msg.push_str(&format!( + "\n* attempted ssh-agent authentication, but \ + no usernames succeeded: {names}" + )); + } + if let Some(failed_cred_helper) = cred_helper_bad { + if failed_cred_helper { + msg.push_str( + "\n* attempted to find username/password via \ + git's `credential.helper` support, but failed", + ); + } else { + msg.push_str( + "\n* attempted to find username/password via \ + `credential.helper`, but maybe the found \ + credentials were incorrect", + ); + } + } + msg.push_str("\n\n"); + msg.push_str("if the git CLI succeeds then `net.git-fetch-with-cli` may help here\n"); + msg.push_str("https://doc.rust-lang.org/cargo/reference/config.html#netgit-fetch-with-cli"); + err = err.context(msg); + + // Otherwise if we didn't even get to the authentication phase them we may + // have failed to set up a connection, in these cases hint on the + // `net.git-fetch-with-cli` configuration option. 
+ } else if let Some(e) = err.downcast_ref::() { + match e.class() { + ErrorClass::Net + | ErrorClass::Ssl + | ErrorClass::Submodule + | ErrorClass::FetchHead + | ErrorClass::Ssh + | ErrorClass::Http => { + let mut msg = "network failure seems to have happened\n".to_string(); + msg.push_str( + "if a proxy or similar is necessary `net.git-fetch-with-cli` may help here\n", + ); + msg.push_str( + "https://doc.rust-lang.org/cargo/reference/config.html#netgit-fetch-with-cli", + ); + err = err.context(msg); + } + ErrorClass::Callback => { + // This unwraps the git2 error. We're using the callback error + // specifically to convey errors from Rust land through the C + // callback interface. We don't need the `; class=Callback + // (26)` that gets tacked on to the git2 error message. + err = anyhow::format_err!("{}", e.message()); + } + _ => {} + } + } + + Err(err) +} + +/// `git reset --hard` to the given `obj` for the `repo`. +/// +/// The `obj` is a commit-ish to which the head should be moved. +fn reset(repo: &git2::Repository, obj: &git2::Object<'_>) -> Result<()> { + // let mut pb = Progress::new("Checkout", config); + let mut opts = git2::build::CheckoutBuilder::new(); + // opts.progress(|_, cur, max| { + // drop(pb.tick(cur, max, "")); + // }); + debug!("doing reset"); + repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?; + debug!("reset done"); + Ok(()) +} + +/// Prepares the callbacks for fetching a git repository. +/// +/// The main purpose of this function is to construct everything before a fetch. +/// This will attempt to setup a progress bar, the authentication for git, +/// ssh known hosts check, and the network retry mechanism. +/// +/// The callback is provided a fetch options, which can be used by the actual +/// git fetch. 
+pub(crate) fn with_fetch_options(
+    git_config: &git2::Config,
+    url: &str,
+    cb: &mut dyn FnMut(git2::FetchOptions<'_>) -> Result<()>,
+) -> Result<()> {
+    retry::with_retry(|| {
+        with_authentication(url, git_config, |f| {
+            // TODO(charlie): Restore progress reporting.
+            let mut rcb = git2::RemoteCallbacks::new();
+            rcb.credentials(f);
+
+            // Wrap the credential callbacks in fetch options and hand them to `cb`.
+            let mut opts = git2::FetchOptions::new();
+            opts.remote_callbacks(rcb);
+            cb(opts)
+        })?;
+        Ok(())
+    })
+}
+
+/// Attempts to fetch the given git `reference` for a Git repository.
+///
+/// This is the main entry for git clone/fetch. It does the following:
+///
+/// * Asks [`github_fast_path`] whether a fetch is needed at all.
+/// * Turns [`GitReference`] into refspecs accordingly.
+/// * Dispatches `git fetch` using libgit2 or git CLI.
+///
+/// The `remote_url` argument is the git remote URL where we want to fetch from.
+pub(crate) fn fetch(
+    repo: &mut git2::Repository,
+    remote_url: &str,
+    reference: &GitReference,
+    strategy: FetchStrategy,
+    client: &Client,
+) -> Result<()> {
+    // A fast-path failure is never fatal: log it and fall back to a regular fetch.
+    let oid_to_fetch = match github_fast_path(repo, remote_url, reference, client) {
+        Ok(FastPathRev::UpToDate) => return Ok(()),
+        Ok(FastPathRev::NeedsFetch(rev)) => Some(rev),
+        Ok(FastPathRev::Indeterminate) => None,
+        Err(e) => {
+            debug!("failed to check github {:?}", e);
+            None
+        }
+    };
+
+    maybe_gc_repo(repo)?;
+
+    clean_repo_temp_files(repo);
+
+    // Translate the reference desired here into an actual list of refspecs
+    // which need to get fetched. Additionally record if we're fetching tags.
+    let mut refspecs = Vec::new();
+    let mut tags = false;
+    // The `+` symbol on the refspec means to allow a forced (fast-forward)
+    // update which is needed if there is ever a force push that requires a
+    // fast-forward.
+    match reference {
+        // For branches and tags we can fetch simply one reference and copy it
+        // locally, no need to fetch other branches/tags.
+        GitReference::Branch(branch) => {
+            refspecs.push(format!("+refs/heads/{branch}:refs/remotes/origin/{branch}"));
+        }
+
+        GitReference::Tag(tag) => {
+            refspecs.push(format!("+refs/tags/{tag}:refs/remotes/origin/tags/{tag}"));
+        }
+
+        GitReference::DefaultBranch => {
+            refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
+        }
+
+        GitReference::Rev(rev) => {
+            if rev.starts_with("refs/") {
+                refspecs.push(format!("+{rev}:{rev}"));
+            } else if let Some(oid_to_fetch) = oid_to_fetch {
+                refspecs.push(format!("+{oid_to_fetch}:refs/commit/{oid_to_fetch}"));
+            } else if rev.parse::<git2::Oid>().is_ok() {
+                // `rev` parses as a full object id, so there is a specific commit to fetch.
+                // NOTE(review): the inherited wording here referred to doing this "in
+                // shallow-mode only", but nothing in this function checks for a shallow
+                // fetch -- confirm whether a guard is missing or the wording is stale.
+                // The reason we write to `refs/remotes/origin/HEAD` is that it's of special significance
+                // during `GitReference::resolve()`, but otherwise it shouldn't matter.
+                refspecs.push(format!("+{rev}:refs/remotes/origin/HEAD"));
+            } else {
+                // We don't know what the rev will point to. To handle this
+                // situation we fetch all branches and tags, and then we pray
+                // it's somewhere in there.
+                refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*"));
+                refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
+                tags = true;
+            }
+        }
+    }
+
+    debug!("Performing a Git fetch for: {remote_url}");
+    match strategy {
+        FetchStrategy::Cli => fetch_with_cli(repo, remote_url, &refspecs, tags),
+        FetchStrategy::Libgit2 => {
+            let git_config = git2::Config::open_default()?;
+            with_fetch_options(&git_config, remote_url, &mut |mut opts| {
+                if tags {
+                    opts.download_tags(git2::AutotagOption::All);
+                }
+                // The `fetch` operation here may fail spuriously due to a corrupt
+                // repository. It could also fail, however, for a whole slew of other
+                // reasons (aka network related reasons). We want Cargo to automatically
+                // recover from corrupt repositories, but we don't want Cargo to stomp
+                // over other legitimate errors.
+                //
+                // Consequently we save off the error of the `fetch` operation and if it
+                // looks like a "corrupt repo" error then we blow away the repo and try
+                // again. If it looks like any other kind of error, or if we've already
+                // blown away the repository, then we want to return the error as-is.
+                let mut repo_reinitialized = false;
+                loop {
+                    debug!("initiating fetch of {refspecs:?} from {remote_url}");
+                    let res =
+                        repo.remote_anonymous(remote_url)?
+                            .fetch(&refspecs, Some(&mut opts), None);
+                    let err = match res {
+                        Ok(()) => break,
+                        Err(e) => e,
+                    };
+                    debug!("fetch failed: {}", err);
+
+                    if !repo_reinitialized
+                        && matches!(err.class(), ErrorClass::Reference | ErrorClass::Odb)
+                    {
+                        repo_reinitialized = true;
+                        debug!(
+                            "looks like this is a corrupt repository, reinitializing \
+                             and trying again"
+                        );
+                        if reinitialize(repo).is_ok() {
+                            continue;
+                        }
+                    }
+
+                    return Err(err.into());
+                }
+                Ok(())
+            })
+        }
+    }
+}
+
+/// Attempts to use `git` CLI installed on the system to fetch a repository,
+/// when the config value [`net.git-fetch-with-cli`][1] is set.
+///
+/// Unfortunately `libgit2` is notably lacking in the realm of authentication
+/// when compared to the `git` command line. As a result, allow an escape
+/// hatch for users that would prefer to use `git`-the-CLI for fetching
+/// repositories instead of `libgit2`-the-library. This should make more
+/// flavors of authentication possible while also still giving us all the
+/// speed and portability of using `libgit2`.
+///
+/// [1]: https://doc.rust-lang.org/nightly/cargo/reference/config.html#netgit-fetch-with-cli
+fn fetch_with_cli(
+    repo: &mut git2::Repository,
+    url: &str,
+    refspecs: &[String],
+    tags: bool,
+) -> Result<()> {
+    // If cargo is run by git (for example, the `exec` command in `git
+    // rebase`), `GIT_DIR` is set by git and points to the wrong location,
+    // taking precedence over the cwd; it must be unset so git looks at the
+    // cwd for the repo. The rest of these may not be necessary, but they are
+    // scrubbed as well to be extra paranoid and avoid any issues.
+    const SCRUBBED_ENV: [&str; 5] = [
+        "GIT_DIR",
+        "GIT_WORK_TREE",
+        "GIT_INDEX_FILE",
+        "GIT_OBJECT_DIRECTORY",
+        "GIT_ALTERNATE_OBJECT_DIRECTORIES",
+    ];
+
+    let mut git = ProcessBuilder::new("git");
+    git.arg("fetch");
+    if tags {
+        git.arg("--tags");
+    }
+    git.arg("--force") // handle force pushes
+        .arg("--update-head-ok") // see discussion in #2078
+        .arg(url)
+        .args(refspecs)
+        .cwd(repo.path());
+    for var in SCRUBBED_ENV {
+        git.env_remove(var);
+    }
+
+    git.exec()?;
+    Ok(())
+}
+
+/// Attempts to `git gc` a repository.
+///
+/// Cargo has a bunch of long-lived git repositories in its global cache and
+/// some, like the index, are updated very frequently. Right now each update
+/// creates a new "pack file" inside the git database, and over time this can
+/// cause bad performance and bad current behavior in libgit2.
+///
+/// One pathological use case today is where libgit2 opens hundreds of file
+/// descriptors, getting us dangerously close to blowing out the OS limits of
+/// how many fds we can have open. This is detailed in [#4403].
+///
+/// To try to combat this problem we attempt a `git gc` here. Note, though, that
+/// we may not even have `git` installed on the system! As a result we
+/// opportunistically try a `git gc` when the pack directory looks too big, and
+/// failing that we just blow away the repository and start over.
+///
+/// In theory this shouldn't be too expensive compared to the network request
+/// we're about to issue.
+///
+/// [#4403]: https://github.com/rust-lang/cargo/issues/4403
+fn maybe_gc_repo(repo: &mut git2::Repository) -> Result<()> {
+    // Here we arbitrarily declare that once the `pack` folder holds 100 or more
+    // entries (or `__CARGO_PACKFILE_LIMIT`, if set and numeric) we need to do a gc.
+    let entries = if let Ok(e) = repo.path().join("objects/pack").read_dir() {
+        e.count()
+    } else {
+        debug!("skipping gc as pack dir appears gone");
+        return Ok(());
+    };
+    let max = env::var("__CARGO_PACKFILE_LIMIT")
+        .ok()
+        .and_then(|s| s.parse::<usize>().ok())
+        .unwrap_or(100);
+    if entries < max {
+        debug!("skipping gc as there's only {} pack files", entries);
+        return Ok(());
+    }
+
+    // First up, try a literal `git gc` by shelling out to git. This is pretty
+    // likely to fail though as we may not have `git` installed. Note that
+    // libgit2 doesn't currently implement the gc operation, so there's no
+    // equivalent there.
+    match Command::new("git")
+        .arg("gc")
+        .current_dir(repo.path())
+        .output()
+    {
+        Ok(out) => {
+            debug!(
+                "git-gc status: {}\n\nstdout ---\n{}\nstderr ---\n{}",
+                out.status,
+                String::from_utf8_lossy(&out.stdout),
+                String::from_utf8_lossy(&out.stderr)
+            );
+            if out.status.success() {
+                // Reopen the repository so the handle in `repo` is replaced
+                // by a fresh one after the gc.
+                let new = git2::Repository::open(repo.path())?;
+                *repo = new;
+                return Ok(());
+            }
+        }
+        Err(e) => debug!("git-gc failed to spawn: {}", e),
+    }
+
+    // Alright all else failed, let's start over.
+    reinitialize(repo)
+}
+
+/// Removes temporary files left from previous activity.
+///
+/// If libgit2 is interrupted while indexing pack files, it will leave behind
+/// some temporary files that it doesn't clean up. These can be quite large in
+/// size, so this tries to clean things up.
+///
+/// This intentionally ignores errors. This is only an opportunistic cleaning,
+/// and we don't really care if there are issues (there's unlikely anything
+/// that can be done).
+///
+/// The git CLI has similar behavior (its temp files look like
+/// `objects/pack/tmp_pack_9kUSA8`). Those files are normally deleted via `git
+/// prune` which is run by `git gc`. However, it doesn't know about libgit2's
+/// filenames, so they never get cleaned up.
+fn clean_repo_temp_files(repo: &git2::Repository) {
+    let path = repo.path().join("objects/pack/pack_git2_*");
+    // `glob` wants a `&str` pattern; a non-UTF-8 path cannot be expressed as one.
+    let pattern = match path.to_str() {
+        Some(pattern) => pattern,
+        None => {
+            warn!("cannot convert {path:?} to a string");
+            return;
+        }
+    };
+    // A pattern that fails to compile is silently skipped: this is best effort.
+    let stale = match glob::glob(pattern) {
+        Ok(stale) => stale,
+        Err(_) => return,
+    };
+    // `flatten` skips matches that could not be read.
+    stale.flatten().for_each(|path| {
+        if let Err(e) = paths::remove_file(&path) {
+            warn!("failed to remove {path:?} while cleaning temp files: {e}");
+        } else {
+            debug!("removed stale temp git file {path:?}");
+        }
+    });
+}
+
+/// Wipes the Git directory behind `repo` and replaces it with a freshly
+/// initialized repository at the same path, for when the existing one looks
+/// corrupted.
+fn reinitialize(repo: &mut git2::Repository) -> Result<()> {
+    // `repo` must stop pointing at the directory we are about to empty, so it
+    // is temporarily re-pointed at a scratch repository in a `tmp` sub-folder.
+    // Everything except that sub-folder is then deleted, the real repository
+    // is recreated in place, and finally the scratch folder is removed.
+    let git_dir = repo.path().to_path_buf();
+    debug!("reinitializing git repo at {:?}", git_dir);
+    let scratch = git_dir.join("tmp");
+    // Must be computed before `repo` is replaced below.
+    let bare = !repo.path().ends_with(".git");
+    *repo = init(&scratch, false)?;
+    for entry in git_dir.read_dir()? {
+        let entry = entry?;
+        let is_scratch = entry.file_name().to_str() == Some("tmp");
+        if !is_scratch {
+            let victim = entry.path();
+            // Try as a file first, then as a directory; failures are ignored.
+            let _ = paths::remove_file(&victim).or_else(|_| paths::remove_dir_all(&victim));
+        }
+    }
+    *repo = init(&git_dir, bare)?;
+    paths::remove_dir_all(&scratch)?;
+    Ok(())
+}
+
+/// Initializes a Git repository at `path`.
+fn init(path: &Path, bare: bool) -> Result<git2::Repository> {
+    let mut opts = git2::RepositoryInitOptions::new();
+    // Skip anything related to templates, they just cause all sorts of issues as
+    // we really don't want to use them yet they insist on being used. See #6240
+    // for an example issue that comes up.
+    opts.external_template(false);
+    opts.bare(bare);
+    Ok(git2::Repository::init_opts(path, &opts)?)
+}
+
+/// The result of GitHub fast path check. See [`github_fast_path`] for more.
+enum FastPathRev {
+    /// The local rev (determined by `reference.resolve(repo)`) is already up to
+    /// date with what this rev resolves to on GitHub's server.
+    UpToDate,
+    /// The following SHA must be fetched in order for the local rev to become
+    /// up to date.
+    NeedsFetch(git2::Oid),
+    /// Don't know whether local rev is up to date. We'll fetch _all_ branches
+    /// and tags from the server and see what happens.
+    Indeterminate,
+}
+
+/// Attempts GitHub's special fast path for testing if we've already got an
+/// up-to-date copy of the repository.
+///
+/// Updating the index is done pretty regularly so we want it to be as fast as
+/// possible. For registries hosted on GitHub (like the crates.io index) there's
+/// a fast path available to use[^1] to tell us that there are no updates to be
+/// made.
+///
+/// Note that this function should never cause an actual failure because it's
+/// just a fast path. As a result, a caller should ignore `Err` returned from
+/// this function and move forward on the normal path.
+/// +/// [^1]: +fn github_fast_path( + repo: &mut git2::Repository, + url: &str, + reference: &GitReference, + client: &Client, +) -> Result { + let url = Url::parse(url)?; + if !is_github(&url) { + return Ok(FastPathRev::Indeterminate); + } + + let local_object = reference.resolve(repo).ok(); + let github_branch_name = match reference { + GitReference::Branch(branch) => branch, + GitReference::Tag(tag) => tag, + GitReference::DefaultBranch => "HEAD", + GitReference::Rev(rev) => { + if rev.starts_with("refs/") { + rev + } else if looks_like_commit_hash(rev) { + // `revparse_single` (used by `resolve`) is the only way to turn + // short hash -> long hash, but it also parses other things, + // like branch and tag names, which might coincidentally be + // valid hex. + // + // We only return early if `rev` is a prefix of the object found + // by `revparse_single`. Don't bother talking to GitHub in that + // case, since commit hashes are permanent. If a commit with the + // requested hash is already present in the local clone, its + // contents must be the same as what is on the server for that + // hash. + // + // If `rev` is not found locally by `revparse_single`, we'll + // need GitHub to resolve it and get a hash. If `rev` is found + // but is not a short hash of the found object, it's probably a + // branch and we also need to get a hash from GitHub, in case + // the branch has moved. 
+ if let Some(local_object) = local_object { + if is_short_hash_of(rev, local_object) { + return Ok(FastPathRev::UpToDate); + } + } + rev + } else { + debug!("can't use github fast path with `rev = \"{}\"`", rev); + return Ok(FastPathRev::Indeterminate); + } + } + }; + + // This expects GitHub urls in the form `github.com/user/repo` and nothing + // else + let mut pieces = url + .path_segments() + .ok_or_else(|| anyhow!("no path segments on url"))?; + let username = pieces + .next() + .ok_or_else(|| anyhow!("couldn't find username"))?; + let repository = pieces + .next() + .ok_or_else(|| anyhow!("couldn't find repository name"))?; + if pieces.next().is_some() { + anyhow::bail!("too many segments on URL"); + } + + // Trim off the `.git` from the repository, if present, since that's + // optional for GitHub and won't work when we try to use the API as well. + let repository = repository.strip_suffix(".git").unwrap_or(repository); + + let url = format!( + "https://api.github.com/repos/{username}/{repository}/commits/{github_branch_name}" + ); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + debug!("Attempting GitHub fast path for: {url}"); + let mut request = client.get(&url); + request = request.header("Accept", "application/vnd.github.3.sha"); + request = request.header("User-Agent", "puffin"); + if let Some(local_object) = local_object { + request = request.header("If-None-Match", local_object.to_string()); + } + + let response = request.send().await?; + response.error_for_status_ref()?; + let response_code = response.status(); + if response_code == StatusCode::NOT_MODIFIED { + Ok(FastPathRev::UpToDate) + } else if response_code == StatusCode::OK { + let oid_to_fetch = response.text().await?.parse::()?; + Ok(FastPathRev::NeedsFetch(oid_to_fetch)) + } else { + // Usually response_code == 404 if the repository does not exist, and + // response_code == 422 if exists but GitHub is unable to 
resolve the + // requested rev. + Ok(FastPathRev::Indeterminate) + } + }) +} + +/// Whether a `url` is one from GitHub. +fn is_github(url: &Url) -> bool { + url.host_str() == Some("github.com") +} + +/// Whether a `rev` looks like a commit hash (ASCII hex digits). +fn looks_like_commit_hash(rev: &str) -> bool { + rev.len() >= 7 && rev.chars().all(|ch| ch.is_ascii_hexdigit()) +} + +/// Whether `rev` is a shorter hash of `oid`. +fn is_short_hash_of(rev: &str, oid: git2::Oid) -> bool { + let long_hash = oid.to_string(); + match long_hash.get(..rev.len()) { + Some(truncated_long_hash) => truncated_long_hash.eq_ignore_ascii_case(rev), + None => false, + } +} + +#[cfg(test)] +mod tests { + use super::absolute_submodule_url; + + #[test] + fn test_absolute_submodule_url() { + let cases = [ + ( + "ssh://git@gitub.com/rust-lang/cargo", + "git@github.com:rust-lang/cargo.git", + "git@github.com:rust-lang/cargo.git", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "./", + "ssh://git@gitub.com/rust-lang/cargo/", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "../", + "ssh://git@gitub.com/rust-lang/", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "./foo", + "ssh://git@gitub.com/rust-lang/cargo/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo/", + "./foo", + "ssh://git@gitub.com/rust-lang/cargo/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo/", + "../foo", + "ssh://git@gitub.com/rust-lang/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "../foo", + "ssh://git@gitub.com/rust-lang/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "../foo/bar/../baz", + "ssh://git@gitub.com/rust-lang/foo/baz", + ), + ( + "git@github.com:rust-lang/cargo.git", + "ssh://git@gitub.com/rust-lang/cargo", + "ssh://git@gitub.com/rust-lang/cargo", + ), + ( + "git@github.com:rust-lang/cargo.git", + "./", + "git@github.com:rust-lang/cargo.git/./", + ), + ( + "git@github.com:rust-lang/cargo.git", + "../", + "git@github.com:rust-lang/cargo.git/../", + ), + ( + 
"git@github.com:rust-lang/cargo.git", + "./foo", + "git@github.com:rust-lang/cargo.git/./foo", + ), + ( + "git@github.com:rust-lang/cargo.git/", + "./foo", + "git@github.com:rust-lang/cargo.git/./foo", + ), + ( + "git@github.com:rust-lang/cargo.git", + "../foo", + "git@github.com:rust-lang/cargo.git/../foo", + ), + ( + "git@github.com:rust-lang/cargo.git/", + "../foo", + "git@github.com:rust-lang/cargo.git/../foo", + ), + ( + "git@github.com:rust-lang/cargo.git", + "../foo/bar/../baz", + "git@github.com:rust-lang/cargo.git/../foo/bar/../baz", + ), + ]; + + for (base_url, submodule_url, expected) in cases { + let url = absolute_submodule_url(base_url, submodule_url).unwrap(); + assert_eq!( + expected, url, + "base `{base_url}`; submodule `{submodule_url}`" + ); + } + } +} diff --git a/crates/puffin-git/src/lib.rs b/crates/puffin-git/src/lib.rs new file mode 100644 index 000000000000..a07592396022 --- /dev/null +++ b/crates/puffin-git/src/lib.rs @@ -0,0 +1,73 @@ +use url::Url; + +pub use self::source::GitSource; + +mod git; +mod source; +mod util; + +/// A reference to a Git repository. +#[derive(Debug, Clone)] +pub struct Git { + /// The URL of the Git repository, with any query parameters and fragments removed. + url: Url, + /// The reference to the commit to use, which could be a branch, tag or revision. + reference: GitReference, + /// The precise commit to use, if known. + precise: Option, +} + +impl TryFrom for Git { + type Error = anyhow::Error; + + /// Initialize a [`Git`] source from a URL. + fn try_from(mut url: Url) -> Result { + let mut reference = GitReference::DefaultBranch; + for (k, v) in url.query_pairs() { + match &k[..] { + // Map older 'ref' to branch. 
+ "branch" | "ref" => reference = GitReference::Branch(v.into_owned()), + "rev" => reference = GitReference::Rev(v.into_owned()), + "tag" => reference = GitReference::Tag(v.into_owned()), + _ => {} + } + } + let precise = url.fragment().map(git2::Oid::from_str).transpose()?; + url.set_fragment(None); + url.set_query(None); + + Ok(Self { + url, + reference, + precise, + }) + } +} + +impl std::fmt::Display for Git { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.url) + } +} + +/// Information to find a specific commit in a Git repository. +#[derive(Debug, Clone)] +pub enum GitReference { + /// From a tag. + Tag(String), + /// From a branch. + Branch(String), + /// From a specific revision. Can be a commit hash (either short or full), + /// or a named reference like `refs/pull/493/head`. + Rev(String), + /// The default branch of the repository, the reference named `HEAD`. + DefaultBranch, +} + +#[derive(Debug, Clone, Copy)] +pub enum FetchStrategy { + /// Fetch Git repositories using libgit2. + Libgit2, + /// Fetch Git repositories using the `git` CLI. + Cli, +} diff --git a/crates/puffin-git/src/source.rs b/crates/puffin-git/src/source.rs new file mode 100644 index 000000000000..4f8ae3328b34 --- /dev/null +++ b/crates/puffin-git/src/source.rs @@ -0,0 +1,91 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::path::PathBuf; + +use anyhow::Result; +use reqwest::Client; +use tracing::debug; + +use puffin_cache::{digest, CanonicalUrl}; + +use crate::git::GitRemote; +use crate::{FetchStrategy, Git, GitReference}; + +/// A remote Git source that can be checked out locally. +pub struct GitSource { + /// The git remote which we're going to fetch from. + remote: GitRemote, + /// The Git reference from the manifest file. + manifest_reference: GitReference, + /// The revision which a git source is locked to. 
+ /// This is expected to be set after the Git repository is fetched. + locked_rev: Option, + /// The identifier of this source for Cargo's Git cache directory. + /// See [`ident`] for more. + ident: String, + /// The HTTP client to use for fetching. + client: Client, + /// The fetch strategy to use when cloning. + strategy: FetchStrategy, + /// The path to the Git source database. + git: PathBuf, +} + +impl GitSource { + pub fn new(reference: Git, git: PathBuf) -> Self { + Self { + remote: GitRemote::new(&reference.url), + manifest_reference: reference.reference, + locked_rev: reference.precise, + ident: digest(&CanonicalUrl::new(&reference.url)), + client: Client::new(), + strategy: FetchStrategy::Libgit2, + git, + } + } + + pub fn fetch(self) -> Result { + // The path to the repo, within the Git database. + let db_path = self.git.join("db").join(&self.ident); + + let (db, actual_rev) = match (self.locked_rev, self.remote.db_at(&db_path).ok()) { + // If we have a locked revision, and we have a preexisting database + // which has that revision, then no update needs to happen. + (Some(rev), Some(db)) if db.contains(rev) => (db, rev), + + // ... otherwise we use this state to update the git database. Note + // that we still check for being offline here, for example in the + // situation that we have a locked revision but the database + // doesn't have it. + (locked_rev, db) => { + debug!("Updating Git source: `{:?}`", self.remote); + + self.remote.checkout( + &db_path, + db, + &self.manifest_reference, + locked_rev, + self.strategy, + &self.client, + )? + } + }; + + // Don’t use the full hash, in order to contribute less to reaching the + // path length limit on Windows. + let short_id = db.to_short_id(actual_rev)?; + + // Check out `actual_rev` from the database to a scoped location on the + // filesystem. This will use hard links and such to ideally make the + // checkout operation here pretty fast. 
+ let checkout_path = self + .git + .join("checkouts") + .join(&self.ident) + .join(short_id.as_str()); + db.copy_to(actual_rev, &checkout_path, self.strategy, &self.client)?; + + Ok(checkout_path) + } +} diff --git a/crates/puffin-git/src/util/errors.rs b/crates/puffin-git/src/util/errors.rs new file mode 100644 index 000000000000..337461cff174 --- /dev/null +++ b/crates/puffin-git/src/util/errors.rs @@ -0,0 +1,45 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::fmt::{self, Write}; + +use super::truncate_with_ellipsis; + +#[derive(Debug)] +pub(crate) struct HttpNotSuccessful { + pub(crate) code: u32, + pub(crate) url: String, + pub(crate) ip: Option, + pub(crate) body: Vec, +} + +impl HttpNotSuccessful { + fn render(&self) -> String { + let mut result = String::new(); + let body = std::str::from_utf8(&self.body).map_or_else( + |_| format!("[{} non-utf8 bytes]", self.body.len()), + |s| truncate_with_ellipsis(s, 512), + ); + + write!( + result, + "failed to get successful HTTP response from `{}`", + self.url + ) + .unwrap(); + if let Some(ip) = &self.ip { + write!(result, " ({ip})").unwrap(); + } + writeln!(result, ", got {}", self.code).unwrap(); + write!(result, "body:\n{body}").unwrap(); + result + } +} + +impl fmt::Display for HttpNotSuccessful { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.render()) + } +} + +impl std::error::Error for HttpNotSuccessful {} diff --git a/crates/puffin-git/src/util/mod.rs b/crates/puffin-git/src/util/mod.rs new file mode 100644 index 000000000000..68795f593b37 --- /dev/null +++ b/crates/puffin-git/src/util/mod.rs @@ -0,0 +1,17 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! 
Source:
+pub(crate) mod errors;
+pub(crate) mod retry;
+
+/// Truncates `s` so that it spans at most `max_width` characters.
+///
+/// The first `max_width - 1` characters are kept and `…` is appended when any
+/// input remains after them, so a string of exactly `max_width` characters is
+/// still shortened. A `max_width` of zero yields an empty string.
+pub(crate) fn truncate_with_ellipsis(s: &str, max_width: usize) -> String {
+    // We should truncate at grapheme-boundary and compute character-widths,
+    // yet the dependencies on unicode-segmentation and unicode-width are
+    // not worth it.
+    //
+    // A plain `max_width - 1` underflows for a zero width, so that case is
+    // answered up front: nothing fits, not even the ellipsis.
+    let Some(keep) = max_width.checked_sub(1) else {
+        return String::new();
+    };
+    let mut chars = s.chars();
+    let mut prefix = (&mut chars).take(keep).collect::<String>();
+    if chars.next().is_some() {
+        prefix.push('…');
+    }
+    prefix
+}
diff --git a/crates/puffin-git/src/util/retry.rs b/crates/puffin-git/src/util/retry.rs
new file mode 100644
index 000000000000..9135fcc71135
--- /dev/null
+++ b/crates/puffin-git/src/util/retry.rs
@@ -0,0 +1,187 @@
+//! Utilities for retrying a network operation.
+//!
+//! Some network errors are considered "spurious", meaning it is not a real
+//! error (such as a 404 not found) and is likely a transient error (like a
+//! bad network connection) that we can hope will resolve itself shortly. The
+//! [`Retry`] type offers a way to repeatedly perform some kind of network
+//! operation with a delay if it detects one of these possibly transient
+//! errors.
+//!
+//! This supports errors from [`git2`], [`reqwest`], and [`HttpNotSuccessful`]
+//! 5xx HTTP errors.
+//!
+//! The number of retries can be configured by the user via the `net.retry`
+//! config option. This indicates the number of times to retry the operation
+//! (default 3 times for a total of 4 attempts).
+//!
+//! There are hard-coded constants that indicate how long to sleep between
+//! retries. The constants are tuned to balance a few factors, such as the
+//! responsiveness to the user (we don't want cargo to hang for too long
+//! retrying things), and accommodating things like Cloudfront's default
+//! negative TTL of 10 seconds (if Cloudfront gets a 5xx error for whatever
+//! reason it won't try to fetch again for 10 seconds).
+//!
+//! The timeout also implements a primitive form of random jitter. This is so
+//!
that if multiple requests fail at the same time that they don't all flood +//! the server at the same time when they are retried. This jitter still has +//! some clumping behavior, but should be good enough. +//! +//! [`Retry`] is the core type for implementing retry logic. The +//! [`Retry::try`] method can be called with a callback, and it will +//! indicate if it needs to be called again sometime in the future if there +//! was a possibly transient error. The caller is responsible for sleeping the +//! appropriate amount of time and then calling [`Retry::try`] again. +//! +//! [`with_retry`] is a convenience function that will create a [`Retry`] and +//! handle repeatedly running a callback until it succeeds, or it runs out of +//! retries. +//! +//! Some interesting resources about retries: +//! - +//! - +//! - + +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::cmp::min; +use std::time::Duration; + +use anyhow::{Error, Result}; +use rand::Rng; +use tracing::warn; + +use crate::util::errors::HttpNotSuccessful; + +/// State for managing retrying a network operation. +pub(crate) struct Retry { + /// The number of failed attempts that have been done so far. + /// + /// Starts at 0, and increases by one each time an attempt fails. + retries: u64, + /// The maximum number of times the operation should be retried. + /// + /// 0 means it should never retry. + max_retries: u64, +} + +/// The result of attempting some operation via [`Retry::try`]. +pub(crate) enum RetryResult { + /// The operation was successful. + /// + /// The wrapped value is the return value of the callback function. + Success(T), + /// The operation was an error, and it should not be tried again. + Err(Error), + /// The operation failed, and should be tried again in the future. + /// + /// The wrapped value is the number of milliseconds to wait before trying + /// again. 
The caller is responsible for waiting this long and then + /// calling [`Retry::try`] again. + Retry(u64), +} + +/// Maximum amount of time a single retry can be delayed (milliseconds). +const MAX_RETRY_SLEEP_MS: u64 = 10 * 1000; +/// The minimum initial amount of time a retry will be delayed (milliseconds). +/// +/// The actual amount of time will be a random value above this. +const INITIAL_RETRY_SLEEP_BASE_MS: u64 = 500; +/// The maximum amount of additional time the initial retry will take (milliseconds). +/// +/// The initial delay will be [`INITIAL_RETRY_SLEEP_BASE_MS`] plus a random range +/// from 0 to this value. +const INITIAL_RETRY_JITTER_MS: u64 = 1000; + +impl Retry { + pub(crate) fn new() -> Retry { + Retry { + retries: 0, + max_retries: 3, + } + } + + /// Calls the given callback, and returns a [`RetryResult`] which + /// indicates whether or not this needs to be called again at some point + /// in the future to retry the operation if it failed. + pub(crate) fn r#try(&mut self, f: impl FnOnce() -> Result) -> RetryResult { + match f() { + Err(ref err) if maybe_spurious(err) && self.retries < self.max_retries => { + let err_msg = err.downcast_ref::().map_or_else( + || err.root_cause().to_string(), + HttpNotSuccessful::to_string, + ); + warn!( + "Spurious network error ({} tries remaining): {err_msg}", + self.max_retries - self.retries, + ); + self.retries += 1; + RetryResult::Retry(self.next_sleep_ms()) + } + Err(e) => RetryResult::Err(e), + Ok(r) => RetryResult::Success(r), + } + } + + /// Gets the next sleep duration in milliseconds. 
+ fn next_sleep_ms(&self) -> u64 { + if self.retries == 1 { + let mut rng = rand::thread_rng(); + INITIAL_RETRY_SLEEP_BASE_MS + rng.gen_range(0..INITIAL_RETRY_JITTER_MS) + } else { + min( + ((self.retries - 1) * 3) * 1000 + INITIAL_RETRY_SLEEP_BASE_MS, + MAX_RETRY_SLEEP_MS, + ) + } + } +} + +fn maybe_spurious(err: &Error) -> bool { + if let Some(git_err) = err.downcast_ref::() { + match git_err.class() { + git2::ErrorClass::Net + | git2::ErrorClass::Os + | git2::ErrorClass::Zlib + | git2::ErrorClass::Http => return git_err.code() != git2::ErrorCode::Certificate, + _ => (), + } + } + if let Some(reqwest_err) = err.downcast_ref::() { + if reqwest_err.is_timeout() + || reqwest_err.is_connect() + || reqwest_err + .status() + .map_or(false, |status| status.is_server_error()) + { + return true; + } + } + if let Some(not_200) = err.downcast_ref::() { + if 500 <= not_200.code && not_200.code < 600 { + return true; + } + } + + false +} + +/// Wrapper method for network call retry logic. +/// +/// Retry counts provided by Config object `net.retry`. Config shell outputs +/// a warning on per retry. +/// +/// Closure must return a `Result`. 
+pub(crate) fn with_retry(mut callback: F) -> Result +where + F: FnMut() -> Result, +{ + let mut retry = Retry::new(); + loop { + match retry.r#try(&mut callback) { + RetryResult::Success(r) => return Ok(r), + RetryResult::Err(e) => return Err(e), + RetryResult::Retry(sleep) => std::thread::sleep(Duration::from_millis(sleep)), + } + } +} diff --git a/crates/puffin-resolver/Cargo.toml b/crates/puffin-resolver/Cargo.toml index ef1455686d50..c42ded9fea1e 100644 --- a/crates/puffin-resolver/Cargo.toml +++ b/crates/puffin-resolver/Cargo.toml @@ -21,6 +21,7 @@ puffin-distribution = { path = "../puffin-distribution" } puffin-normalize = { path = "../puffin-normalize" } puffin-package = { path = "../puffin-package" } puffin-traits = { path = "../puffin-traits" } +puffin-git = { path = "../puffin-git" } distribution-filename = { path = "../distribution-filename" } anyhow = { workspace = true } diff --git a/crates/puffin-resolver/src/resolver.rs b/crates/puffin-resolver/src/resolver.rs index 60e3fa93d55d..cbf688426887 100644 --- a/crates/puffin-resolver/src/resolver.rs +++ b/crates/puffin-resolver/src/resolver.rs @@ -687,14 +687,20 @@ impl<'a, Context: BuildContext + Sync> Resolver<'a, Context> { let build_tree = SourceDistributionBuildTree::new(self.build_context); let distribution = RemoteDistributionRef::from_url(&package_name, &url); let metadata = match build_tree.find_dist_info(&distribution, self.tags) { - Ok(Some(metadata)) => metadata, - Ok(None) => build_tree - .download_and_build_sdist(&distribution, self.client) - .await - .map_err(|err| ResolveError::UrlDistribution { - url: url.clone(), - err, - })?, + Ok(Some(metadata)) => { + debug!("Found source distribution metadata in cache: {url}"); + metadata + } + Ok(None) => { + debug!("Downloading source distribution from: {url}"); + build_tree + .download_and_build_sdist(&distribution, self.client) + .await + .map_err(|err| ResolveError::UrlDistribution { + url: url.clone(), + err, + })? 
+ } Err(err) => { error!( "Failed to read source distribution {distribution} from cache: {err}", @@ -715,18 +721,22 @@ impl<'a, Context: BuildContext + Sync> Resolver<'a, Context> { let build_tree = SourceDistributionBuildTree::new(self.build_context); let distribution = RemoteDistributionRef::from_url(&package_name, &url); let metadata = match build_tree.find_dist_info(&distribution, self.tags) { - Ok(Some(metadata)) => metadata, - Ok(None) => build_tree - .download_wheel(&distribution, self.client) - .await - .map_err(|err| ResolveError::UrlDistribution { - url: url.clone(), - err, - })?, + Ok(Some(metadata)) => { + debug!("Found wheel metadata in cache: {url}"); + metadata + } + Ok(None) => { + debug!("Downloading wheel from: {url}"); + build_tree + .download_wheel(&distribution, self.client) + .await + .map_err(|err| ResolveError::UrlDistribution { + url: url.clone(), + err, + })? + } Err(err) => { - error!( - "Failed to read built distribution {distribution} from cache: {err}", - ); + error!("Failed to read wheel {distribution} from cache: {err}",); build_tree .download_wheel(&distribution, self.client) .await diff --git a/crates/puffin-resolver/src/source_distribution.rs b/crates/puffin-resolver/src/source_distribution.rs index adf295535295..6d3de9b071f3 100644 --- a/crates/puffin-resolver/src/source_distribution.rs +++ b/crates/puffin-resolver/src/source_distribution.rs @@ -1,17 +1,20 @@ +use std::borrow::Cow; use std::path::PathBuf; use std::str::FromStr; -use anyhow::Result; +use anyhow::{Error, Result}; use fs_err::tokio as fs; use tempfile::tempdir; use tokio_util::compat::FuturesAsyncReadCompatExt; use tracing::debug; +use url::Url; use zip::ZipArchive; use distribution_filename::WheelFilename; use platform_tags::Tags; use puffin_client::RegistryClient; use puffin_distribution::RemoteDistributionRef; +use puffin_git::{Git, GitSource}; use puffin_package::pypi_types::Metadata21; use puffin_traits::BuildContext; @@ -19,6 +22,8 @@ const BUILT_WHEELS_CACHE: 
&str = "built-wheels-v0"; const REMOTE_WHEELS_CACHE: &str = "remote-wheels-v0"; +const GIT_CACHE: &str = "git-v0"; + /// Stores wheels built from source distributions. We need to keep those separate from the regular /// wheel cache since a wheel with the same name may be uploaded after we made our build and in that /// case the hashes would clash. @@ -49,16 +54,36 @@ impl<'a, T: BuildContext> SourceDistributionBuildTree<'a, T> { client: &RegistryClient, ) -> Result { debug!("Building: {distribution}"); - let url = distribution.url()?; - let reader = client.stream_external(&url).await?; - let mut reader = tokio::io::BufReader::new(reader.compat()); + let temp_dir = tempdir()?; - // Download the source distribution. - let sdist_filename = distribution.filename()?; - let sdist_file = temp_dir.path().join(sdist_filename.as_ref()); - let mut writer = tokio::fs::File::create(&sdist_file).await?; - tokio::io::copy(&mut reader, &mut writer).await?; + let source = DistributionSource::try_from(distribution)?; + let sdist_file = match source { + DistributionSource::Url(url) => { + debug!("Fetching source distribution from: {url}"); + + let reader = client.stream_external(&url).await?; + let mut reader = tokio::io::BufReader::new(reader.compat()); + + // Download the source distribution. + let sdist_filename = distribution.filename()?; + let sdist_file = temp_dir.path().join(sdist_filename.as_ref()); + let mut writer = tokio::fs::File::create(&sdist_file).await?; + tokio::io::copy(&mut reader, &mut writer).await?; + + sdist_file + } + DistributionSource::Git(git) => { + debug!("Fetching source distribution from: {git}"); + + let git_dir = self.0.cache().map_or_else( + || temp_dir.path().join(GIT_CACHE), + |cache| cache.join(GIT_CACHE), + ); + let source = GitSource::new(git, git_dir); + tokio::task::spawn_blocking(move || source.fetch()).await?? + } + }; // Create a directory for the wheel. 
let wheel_dir = self.0.cache().map_or_else( @@ -166,3 +191,38 @@ fn read_dist_info(wheel: &CachedWheel) -> Result { )?; Ok(Metadata21::parse(dist_info.as_bytes())?) } + +/// The host source for a distribution. +#[derive(Debug)] +enum DistributionSource<'a> { + /// The distribution is available at a remote URL. This could be a dedicated URL, or a URL + /// served by a registry, like PyPI. + Url(Cow<'a, Url>), + /// The distribution is available in a remote Git repository. + Git(Git), +} + +impl<'a> TryFrom<&'a RemoteDistributionRef<'_>> for DistributionSource<'a> { + type Error = Error; + + fn try_from(value: &'a RemoteDistributionRef<'_>) -> Result { + match value { + // If a distribution is hosted on a registry, it must be available at a URL. + RemoteDistributionRef::Registry(_, _, file) => { + let url = Url::parse(&file.url)?; + Ok(Self::Url(Cow::Owned(url))) + } + // If a distribution is specified via a direct URL, it could be a URL to a hosted file, + // or a URL to a Git repository. + RemoteDistributionRef::Url(_, url) => { + if let Some(url) = url.as_str().strip_prefix("git+") { + let url = Url::parse(url)?; + let git = Git::try_from(url)?; + Ok(Self::Git(git)) + } else { + Ok(Self::Url(Cow::Borrowed(url))) + } + } + } + } +}