Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(turborepo): avoid globbing directories due to ancestor truncation #5273

Merged
merged 19 commits into from
Jun 13, 2023
Merged
5 changes: 3 additions & 2 deletions cli/turbo.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@

"inputs": [
"**/*.go",
"**/*_test.go",
"../crates/turborepo*/**" // Rust crates
"!**/*_test.go",
"../crates/turborepo*/**/*.rs", // Rust crates
"!../crates/**/target"
]
},
"e2e": {
Expand Down
114 changes: 109 additions & 5 deletions crates/turborepo-globwalk/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ use std::{
use empty_glob::InclusiveEmptyAny;
use itertools::Itertools;
use path_slash::PathExt;
use turbopath::{AbsoluteSystemPath, AbsoluteSystemPathBuf};
use wax::{Any, BuildError, Glob, Pattern};
use turbopath::{AbsoluteSystemPath, AbsoluteSystemPathBuf, PathError};
use wax::{Any, BuildError, Glob, Pattern, WalkBehavior, WalkEntry};

#[derive(Debug, PartialEq, Clone, Copy)]
pub enum WalkType {
Expand Down Expand Up @@ -50,6 +50,10 @@ pub enum WalkError {
InvalidPath,
#[error("walk error: {0}")]
WalkError(#[from] walkdir::Error),
#[error(transparent)]
Path(#[from] PathError),
#[error(transparent)]
WaxWalk(#[from] wax::WalkError),
}

/// Performs a glob walk, yielding paths that _are_ included in the include list
Expand All @@ -62,7 +66,7 @@ pub enum WalkError {
/// - collapse the path, and calculate the new base_path, which defined as
/// the longest common prefix of all the includes
/// - traversing above the root of the base_path is not allowed
pub fn globwalk(
pub fn _globwalk(
base_path: &AbsoluteSystemPath,
include: &[String],
exclude: &[String],
Expand Down Expand Up @@ -277,6 +281,79 @@ fn collapse_path(path: &str) -> Option<(Cow<str>, usize)> {
}
}

pub fn globwalk(
base_path: &AbsoluteSystemPath,
include: &[String],
exclude: &[String],
walk_type: WalkType,
) -> Result<Vec<Result<AbsoluteSystemPathBuf, WalkError>>, WalkError> {
let (base_path_new, include_paths, exclude_paths) =
preprocess_paths_and_globs(base_path, include, exclude)?;
let inc_patterns = include_paths
.iter()
.map(|g| Glob::new(g.as_str()).map(|g| g.into_owned()))
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
.collect::<Result<Vec<_>, BuildError>>()?;
let ex_patterns = exclude_paths
.iter()
.map(|g| Glob::new(g.as_str()).map(|g| g.into_owned()))
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
.collect::<Result<Vec<_>, _>>()?;

let result = inc_patterns
.into_iter()
.flat_map(|glob| {
// Check if the glob specifies an exact filename with no meta characters.
if let Some(prefix) = glob.variance().path() {
// We expect all of our globs to be absolute paths
debug_assert!(glob.has_root());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit, since this is a public function I'd rather have this failure expressed as a Err instead of a panic. Could do this check while building up the inc_patterns vector

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heh, I think this is actually a pointer at a different error. This particular assert is for an internal error, preprocess_paths_and_globs should only return globs that are absolute paths (they are joined to base_path). User input into the function should not be able to trigger it.

However, from a quick read, it looks like we instead will just skip invalid globs in collapse_paths, called by preprocess_paths_and_globs, which should instead be an error.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left the debug assert, but added the explicit error case above as well.

// We're either going to return this path or nothing. Check if it's a directory
// and if we want directories
match AbsoluteSystemPathBuf::new(prefix)
.map_err(|e| e.into())
.and_then(|path| path.symlink_metadata().map(|md| (path, md)))
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
{
Err(e) => {
// If the file doesn't exist, it's not an error, there's just nothing to
// glob
if e.is_io_error(ErrorKind::NotFound) {
vec![]
} else {
vec![Err(e.into())]
}
}
Ok((path, md)) => {
if walk_type == WalkType::Files && md.is_dir() {
vec![]
} else {
vec![Ok(path)]
}
}
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
}
} else {
glob.walk(&base_path_new)
.not(ex_patterns.iter().map(|g| g.to_owned()))
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
// Per docs, only fails if exclusion list is too large, since we're using
// pre-compiled globs
.expect(&format!(
"Failed to compile exclusion globs: {:?}",
ex_patterns
))
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
.filter_map(|entry| match entry {
Ok(entry) => {
if walk_type == WalkType::Files && entry.file_type().is_dir() {
None
} else {
Some(AbsoluteSystemPathBuf::new(entry.path()).map_err(|e| e.into()))
}
}
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
Err(e) => Some(Err(e.into())),
})
.collect::<Vec<_>>()
}
})
.collect::<Vec<_>>();
Ok(result)
}

#[cfg(test)]
mod test {
use std::{collections::HashSet, path::Path};
Expand All @@ -290,6 +367,28 @@ mod test {
collapse_path, empty_glob::InclusiveEmptyAny, globwalk, MatchType, WalkError, WalkType,
};

#[test]
fn test_wax_globwalk() {
let cwd = AbsoluteSystemPathBuf::cwd().unwrap();
let repo_root = cwd
.ancestors()
.find(|dir| dir.join_component(".git").exists())
.unwrap();
let includes: &[String] = &[
"cli/{cmd,internal}/**/*.go".to_string(),
"cli/../crates/turborepo*/**/*.rs".to_string(),
"cli/package.json".to_string(),
"cli/turbo.json".to_string(),
];
let excludes: &[String] = &[
"cli/**/*_test.go".to_string(),
"cli/**/crates/**/target".to_string(),
];
let files = globwalk(&repo_root, includes, excludes, WalkType::Files).unwrap();
println!("{:?} ({})", files, files.len());
assert!(files.len() < 1000);
}

#[test_case("a/./././b", "a/b", 1 ; "test path with dot segments")]
#[test_case("a/../b", "b", 0 ; "test path with dotdot segments")]
#[test_case("a/./../b", "b", 0 ; "test path with mixed dot and dotdot segments")]
Expand Down Expand Up @@ -512,7 +611,8 @@ mod test {
#[test_case("**/*.txt", 1, 1 => matches None)]
#[test_case("**/【*", 1, 1 => matches None)]
// in the go implementation, broken-symlink is yielded,
// however in symlink mode, walkdir yields broken symlinks as errors
// however in symlink mode, walkdir yields broken symlinks as errors.
// Note that walkdir _always_ follows root symlinks.
#[test_case("broken-symlink", 1, 1 => matches None ; "broken symlinks should be yielded")]
// globs that match across a symlink should not follow the symlink
#[test_case("working-symlink/c/*", 0, 0 => matches None ; "working symlink should not be followed")]
Expand Down Expand Up @@ -1131,6 +1231,7 @@ mod test {
let (success, _): (Vec<AbsoluteSystemPathBuf>, Vec<_>) =
super::globwalk(&path, &include, &exclude, walk_type)
.unwrap()
.into_iter()
.partition_result();

let success = success
Expand Down Expand Up @@ -1220,7 +1321,9 @@ mod test {
let child = root.join_component("child");
let include = &["../*-file".to_string()];
let exclude = &[];
let iter = globwalk(&child, include, exclude, WalkType::Files).unwrap();
let iter = globwalk(&child, include, exclude, WalkType::Files)
.unwrap()
.into_iter();
let results = iter
.map(|entry| {
let entry = entry.unwrap();
Expand Down Expand Up @@ -1250,6 +1353,7 @@ mod test {
let exclude = &["apps/ignored".to_string(), "**/node_modules/**".to_string()];
let iter = globwalk(&root, include, exclude, WalkType::Files).unwrap();
let paths = iter
.into_iter()
.map(|path| {
let path = path.unwrap();
let relative = root.anchor(path).unwrap();
Expand Down
3 changes: 3 additions & 0 deletions crates/turborepo-scm/src/package_deps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ pub fn get_package_file_hashes_from_inputs<S: AsRef<str>>(
Either::Left([package_unix_path, raw_glob.as_ref()].join("/"))
}
});
println!("glob start");
let iter = globwalk::globwalk(
turbo_root,
&inclusions,
Expand All @@ -78,7 +79,9 @@ pub fn get_package_file_hashes_from_inputs<S: AsRef<str>>(
Ok(path)
})
.collect::<Result<Vec<_>, Error>>()?;
println!("glob end");
let mut hashes = GitHashes::new();
println!("hashing {} files", to_hash.len());
hash_objects(&git_root, &full_pkg_path, to_hash, &mut hashes)?;
Ok(hashes)
}
Expand Down
10 changes: 9 additions & 1 deletion crates/turborepo-wax/src/walk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,17 @@ macro_rules! walk {
.strip_prefix(&$state.prefix)
.expect("path is not in tree");
let depth = entry.depth().saturating_sub(1);
// Globs don't include the root token, but absolute paths on unix do.
// If we have an absolute path and are on unix, skip that token so that
// matching up components will work below.
let to_skip_in_path = if cfg!(unix) && path.is_absolute() {
depth + 1
} else {
depth
};
for candidate in path
.components()
.skip(depth)
.skip(to_skip_in_path)
.filter_map(|component| match component {
Component::Normal(component) => Some(CandidatePath::from(component)),
_ => None,
Expand Down