From 057f2b35c2c7645742c9f1ed8f4ace47c06766e6 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Thu, 19 Sep 2024 12:40:51 -0400 Subject: [PATCH 1/2] Rework repo discovery We don't store the repo name, since it can be derived from the url; instead validate that the url is well formed. --- src/lib.rs | 89 ++++++++++++++---------------------------------- src/repo_info.rs | 54 +++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b71789c..0a5fb76 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,7 @@ //! ``` use std::{ - collections::{BTreeMap, BTreeSet, HashSet}, + collections::{BTreeSet, HashSet}, path::{Path, PathBuf}, sync::{ atomic::{AtomicBool, Ordering}, @@ -79,6 +79,9 @@ pub fn run(args: &Args) { /// Discover repositories containing font source files. /// +/// Returns a vec of `RepoInfo` structs describing repositories containing +/// known font sources. +/// /// This looks at every font in the google/fonts github repo, looks to see if /// we have a known upstream repository for that font, and then looks to see if /// that repo contains a config.yaml file. @@ -86,7 +89,7 @@ pub fn run(args: &Args) { /// The 'fonts_repo_path' is the path to a local checkout of the [google/fonts] /// repository. If this is `None`, we will clone that repository to a tempdir. /// -/// The 'sources_dir' is the path to a directory where repositories will be +/// The 'git_cache_dir' is the path to a directory where repositories will be /// checked out, if necessary. Because we check out lots of repos (and it is /// likely that the caller will want to check these out again later) it makes /// sense to cache these in most cases. @@ -94,7 +97,7 @@ pub fn run(args: &Args) { /// [google/fonts]: https://github.com/google/fonts pub fn discover_sources( fonts_repo_path: Option<&Path>, - sources_dir: Option<&Path>, + git_cache_dir: Option<&Path>, verbose: bool, ) -> Vec { let candidates = match fonts_repo_path { @@ -102,14 +105,14 @@ pub fn discover_sources( None => get_candidates_from_remote(verbose), }; - let have_repo = pruned_candidates(&candidates); + let have_repo = candidates_with_known_repo(&candidates); log::info!( "checking {} repositories for config.yaml files", have_repo.len() ); - let has_config_files = if let Some(font_path) = sources_dir { - find_config_files(&have_repo, font_path) + let repos_with_config_files = if let Some(git_cache) = git_cache_dir { + find_config_files(&have_repo, git_cache) } else { let tempdir = tempfile::tempdir().unwrap(); find_config_files(&have_repo, tempdir.path()) @@ -124,36 +127,16 @@ pub fn discover_sources( log::debug!( "{} of {} have sources/config.yaml", - has_config_files.len(), + repos_with_config_files.len(), have_repo.len() ); } - let mut repos: Vec<_> = have_repo - .iter() - .filter_map(|meta| { - has_config_files - .get(&meta.name) - .map(|(configs, rev)| RepoInfo { - repo_name: meta - .repo_url - .as_deref() - .and_then(repo_name_from_url) - .expect("already checked") - .to_owned(), - repo_url: meta.repo_url.clone().unwrap(), - rev: rev.clone(), - config_files: configs.clone(), - }) - }) - .collect(); - - repos.sort(); - repos + repos_with_config_files } /// Returns the set of candidates that have a unique repository URL -fn pruned_candidates(candidates: &BTreeSet) -> BTreeSet { +fn candidates_with_known_repo(candidates: &BTreeSet) -> BTreeSet { let mut seen_repos = HashSet::new(); let mut result = BTreeSet::new(); for metadata in candidates { @@ -180,37 +163,25 @@ fn pruned_candidates(candidates: &BTreeSet) -> BTreeSet { /// We naively look for the most common file names using a simple http request, /// and if we don't find anything then we clone the repo locally and inspect /// its contents. -fn find_config_files( - fonts: &BTreeSet, - checkout_font_dir: &Path, -) -> BTreeMap, GitRev)> { +fn find_config_files(fonts: &BTreeSet, git_cache_dir: &Path) -> Vec { let n_has_repo = fonts.iter().filter(|md| md.repo_url.is_some()).count(); // messages sent from a worker thread enum Message { - Finished { - font: String, - configs: Vec, - rev: GitRev, - }, + Finished(Option), ErrorMsg(String), RateLimit(usize), } rayon::scope(|s| { - let mut result = BTreeMap::new(); + let mut result = Vec::new(); let mut seen = 0; let mut sent = 0; let mut progressbar = kdam::tqdm!(total = n_has_repo); let rate_limited = Arc::new(AtomicBool::new(false)); let (tx, rx) = channel(); - for (name, repo) in fonts - .iter() - .filter_map(|meta| meta.repo_url.as_ref().map(|repo| (&meta.name, repo))) - { - let repo = repo.clone(); - let name = name.clone(); + for repo_url in fonts.iter().filter_map(|meta| meta.repo_url.clone()) { let tx = tx.clone(); let rate_limited = rate_limited.clone(); s.spawn(move |_| { @@ -220,23 +191,15 @@ fn find_config_files( std::thread::sleep(Duration::from_secs(1)); } // then try to get configs (which may trigger rate limiting) - match config_files_and_rev_for_repo(&repo, checkout_font_dir) { - Ok((configs, rev)) => { - tx.send(Message::Finished { - font: name, - configs, - rev, - }) - .unwrap(); + match config_files_and_rev_for_repo(&repo_url, git_cache_dir) { + Ok((config_files, rev)) if !config_files.is_empty() => { + let info = RepoInfo::new(repo_url, rev, config_files); + tx.send(Message::Finished(info)).unwrap(); break; } - Err(ConfigFetchIssue::NoConfigFound) => { - tx.send(Message::Finished { - font: name, - configs: Default::default(), - rev: Default::default(), - }) - .unwrap(); + // no configs found or looking for configs failed: + Err(ConfigFetchIssue::NoConfigFound) | Ok(_) => { + tx.send(Message::Finished(None)).unwrap(); break; } // if we're rate limited, set the flag telling other threads @@ -266,9 +229,9 @@ fn find_config_files( while seen < sent { match rx.recv() { - Ok(Message::Finished { font, configs, rev }) => { - if !configs.is_empty() { - result.insert(font, (configs, rev)); + Ok(Message::Finished(info)) => { + if let Some(info) = info { + result.push(info); } seen += 1; } diff --git a/src/repo_info.rs b/src/repo_info.rs index 96a7263..0c9d860 100644 --- a/src/repo_info.rs +++ b/src/repo_info.rs @@ -6,26 +6,61 @@ use crate::{error::LoadRepoError, Config}; /// Information about a git repository containing font sources #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] pub struct RepoInfo { - /// The name of the repository. - /// - /// This is everything after the trailing '/' in e.g. `https://github.com/PaoloBiagini/Joan` - pub repo_name: String, /// The repository's url pub repo_url: String, /// The commit rev of the repository's main branch, at discovery time. - pub rev: String, + //NOTE: this isn't private because we want to force the use of `new` for + //construction, so we can ensure urls are well formed + rev: String, /// The names of config files that exist in this repository's source directory pub config_files: Vec, } impl RepoInfo { + /// Create a `RepoInfo` after some validation. + /// + /// Returns `None` if the url has some unexpected format, or if there are + /// no config files + pub(crate) fn new(repo_url: String, rev: String, config_files: Vec) -> Option { + if repo_name_and_org_from_url(&repo_url).is_none() { + log::warn!("unexpected repo url '{repo_url}'"); + return None; + } + Some(Self { + repo_url, + rev, + config_files, + }) + } + + /// The name of the user or org that the repository lives under. + /// + /// This is 'googlefonts' for the repo `https://github.com/googlefonts/google-fonts-sources` + pub fn repo_org(&self) -> &str { + // unwrap is safe because we validate at construction time + repo_name_and_org_from_url(&self.repo_url).unwrap().0 + } + + /// The name of the repository. + /// + /// This is everything after the trailing '/' in e.g. `https://github.com/PaoloBiagini/Joan` + pub fn repo_name(&self) -> &str { + repo_name_and_org_from_url(&self.repo_url).unwrap().1 + } + + /// The commit rev of the repository's main branch, at discovery time. + pub fn git_rev(&self) -> &str { + &self.rev + } + /// Return the a `Vec` of source files in this respository. /// /// If necessary, this will create a new checkout of this repo at /// '{font_dir}/{repo_name}'. pub fn get_sources(&self, font_repos_dir: &Path) -> Result, LoadRepoError> { - let font_dir = font_repos_dir.join(&self.repo_name); + let font_dir = font_repos_dir.join(self.repo_name()); if !font_dir.exists() { std::fs::create_dir_all(&font_dir)?; @@ -62,3 +97,10 @@ impl RepoInfo { Ok(sources) } } + +fn repo_name_and_org_from_url(url: &str) -> Option<(&str, &str)> { + let url = url.trim_end_matches('/'); + let (rest, name) = url.rsplit_once('/')?; + let (_, org) = rest.rsplit_once('/')?; + Some((org, name)) +} From 4cf9388162262e36813d85560a279b2515eda138 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Thu, 19 Sep 2024 12:46:37 -0400 Subject: [PATCH 2/2] Check that source exists before returning it It's possible for a config to name a source that doesn't actually exist in the repo. --- src/repo_info.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/repo_info.rs b/src/repo_info.rs index 0c9d860..5e41aa6 100644 --- a/src/repo_info.rs +++ b/src/repo_info.rs @@ -58,9 +58,9 @@ impl RepoInfo { /// Return the a `Vec` of source files in this respository. /// /// If necessary, this will create a new checkout of this repo at - /// '{font_dir}/{repo_name}'. - pub fn get_sources(&self, font_repos_dir: &Path) -> Result, LoadRepoError> { - let font_dir = font_repos_dir.join(self.repo_name()); + /// '{git_cache_dir}/{repo_name}'. + pub fn get_sources(&self, git_cache_dir: &Path) -> Result, LoadRepoError> { + let font_dir = git_cache_dir.join(self.repo_name()); if !font_dir.exists() { std::fs::create_dir_all(&font_dir)?; @@ -89,7 +89,10 @@ impl RepoInfo { let mut sources = configs .iter() .flat_map(|c| c.sources.iter()) - .map(|source| source_dir.join(source)) + .filter_map(|source| { + let source = source_dir.join(source); + source.exists().then_some(source) + }) .collect::>(); sources.sort_unstable(); sources.dedup();