Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(turborepo): avoid globbing directories due to ancestor truncation #5273

Merged
merged 19 commits into from
Jun 13, 2023
Merged
8 changes: 5 additions & 3 deletions cli/turbo.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
],

"inputs": [
"**/*.go",
"**/*_test.go",
"../crates/turborepo*/**" // Rust crates
"{internal,cmd}/**/*.go",
"!**/*_test.go",
"../crates/turborepo*/**/*.rs", // Rust crates
"../crates/turborepo*/Cargo.toml",
"!../crates/**/target"
]
},
"e2e": {
Expand Down
17 changes: 4 additions & 13 deletions crates/turborepo-ffi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ mod lockfile;

use std::{collections::HashMap, mem::ManuallyDrop, path::PathBuf};

use globwalk::{globwalk, WalkError};
use globwalk::globwalk;
pub use lockfile::{patches, subgraph, transitive_closure};
use turbopath::{AbsoluteSystemPathBuf, AnchoredSystemPathBuf};
use turborepo_env::EnvironmentVariableMap;
Expand Down Expand Up @@ -483,13 +483,13 @@ pub extern "C" fn glob(buffer: Buffer) -> Buffer {
false => globwalk::WalkType::All,
};

let iter = match globwalk(
let files = match globwalk(
&AbsoluteSystemPathBuf::new(req.base_path).expect("absolute"),
&req.include_patterns,
&req.exclude_patterns,
walk_type,
) {
Ok(iter) => iter,
Ok(files) => files,
Err(err) => {
let resp = proto::GlobResp {
response: Some(proto::glob_resp::Response::Error(err.to_string())),
Expand All @@ -498,17 +498,8 @@ pub extern "C" fn glob(buffer: Buffer) -> Buffer {
}
};

let paths = match iter.collect::<Result<Vec<_>, WalkError>>() {
Ok(paths) => paths,
Err(err) => {
let resp = proto::GlobResp {
response: Some(proto::glob_resp::Response::Error(err.to_string())),
};
return resp.into();
}
};
// TODO: is to_string_lossy the right thing to do here? We could error...
let files: Vec<_> = paths
let files: Vec<_> = files
.into_iter()
.map(|path| path.to_string_lossy().to_string())
.collect();
Expand Down
120 changes: 102 additions & 18 deletions crates/turborepo-globwalk/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ mod empty_glob;

use std::{
borrow::Cow,
collections::HashSet,
io::ErrorKind,
path::{Path, PathBuf},
};

use empty_glob::InclusiveEmptyAny;
use itertools::Itertools;
use path_slash::PathExt;
use turbopath::{AbsoluteSystemPath, AbsoluteSystemPathBuf};
use turbopath::{AbsoluteSystemPath, AbsoluteSystemPathBuf, PathError};
use wax::{Any, BuildError, Glob, Pattern};

#[derive(Debug, PartialEq, Clone, Copy)]
Expand Down Expand Up @@ -50,6 +51,12 @@ pub enum WalkError {
InvalidPath,
#[error("walk error: {0}")]
WalkError(#[from] walkdir::Error),
#[error(transparent)]
Path(#[from] PathError),
#[error(transparent)]
WaxWalk(#[from] wax::WalkError),
#[error("Internal error on glob {glob}: {error}")]
InternalError { glob: String, error: String },
}

/// Performs a glob walk, yielding paths that _are_ included in the include list
Expand All @@ -62,7 +69,7 @@ pub enum WalkError {
/// - collapse the path, and calculate the new base_path, which defined as
/// the longest common prefix of all the includes
/// - traversing above the root of the base_path is not allowed
pub fn globwalk(
pub fn _globwalk(
base_path: &AbsoluteSystemPath,
include: &[String],
exclude: &[String],
Expand Down Expand Up @@ -277,6 +284,86 @@ fn collapse_path(path: &str) -> Option<(Cow<str>, usize)> {
}
}

pub fn globwalk(
base_path: &AbsoluteSystemPath,
include: &[String],
exclude: &[String],
walk_type: WalkType,
) -> Result<HashSet<AbsoluteSystemPathBuf>, WalkError> {
let (base_path_new, include_paths, exclude_paths) =
preprocess_paths_and_globs(base_path, include, exclude)?;
let inc_patterns = include_paths
.iter()
.map(|g| {
Glob::new(g.as_str()).map_err(|e| e.into()).and_then(|g| {
if g.has_root() {
Ok(g)
} else {
Err(WalkError::InternalError {
glob: g.to_string(),
error: "expected glob to have a root".to_string(),
})
}
})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Glob::new(g.as_str()).map_err(|e| e.into()).and_then(|g| {
if g.has_root() {
Ok(g)
} else {
Err(WalkError::InternalError {
glob: g.to_string(),
error: "expected glob to have a root".to_string(),
})
}
})
match Glob::new(g.as_str()) {
Err(e) => e.into(),
Ok(g) if g.has_root() => Ok(g),
Ok(g) => Err(WalkError::InternalError {
glob: g.to_string(),
error: "expected glob to have a root".to_string(),
})
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is sadly irrelevant in the latest revision, we can't rely on globs being rooted, as they don't count as rooted on windows.

})
.collect::<Result<Vec<_>, WalkError>>()?;
let ex_patterns = exclude_paths
.iter()
.map(|g| Glob::new(g.as_str()))
.collect::<Result<Vec<_>, _>>()?;

let result = inc_patterns
.into_iter()
.flat_map(|glob| {
// Check if the glob specifies an exact filename with no meta characters.
if let Some(prefix) = glob.variance().path() {
// We expect all of our globs to be absolute paths (asserted above)
debug_assert!(glob.has_root());
// We're either going to return this path or nothing. Check if it's a directory
// and if we want directories
match AbsoluteSystemPathBuf::new(prefix).and_then(|path| {
let metadata = path.symlink_metadata()?;
Ok((path, metadata))
}) {
Err(e) if e.is_io_error(ErrorKind::NotFound) => {
// If the file doesn't exist, it's not an error, there's just nothing to
// glob
vec![]
}
Err(e) => vec![Err(e.into())],
Ok((_, md)) if walk_type == WalkType::Files && md.is_dir() => {
vec![]
}
Ok((path, _)) => vec![Ok(path)],
}
} else {
glob.walk(&base_path_new)
.not(ex_patterns.iter().cloned())
// Per docs, only fails if exclusion list is too large, since we're using
// pre-compiled globs
.unwrap_or_else(|e| {
panic!(
"Failed to compile exclusion globs: {:?}: {}",
ex_patterns, e,
)
})
.filter_map(|entry| match entry {
Ok(entry) => {
if walk_type == WalkType::Files && entry.file_type().is_dir() {
None
} else {
Some(AbsoluteSystemPathBuf::new(entry.path()).map_err(|e| e.into()))
}
}
gsoltis marked this conversation as resolved.
Show resolved Hide resolved
Err(e) => Some(Err(e.into())),
})
.collect::<Vec<_>>()
}
})
.collect::<Result<HashSet<_>, WalkError>>()?;
Ok(result)
}

#[cfg(test)]
mod test {
use std::{collections::HashSet, path::Path};
Expand Down Expand Up @@ -512,7 +599,9 @@ mod test {
#[test_case("**/*.txt", 1, 1 => matches None)]
#[test_case("**/【*", 1, 1 => matches None)]
// in the go implementation, broken-symlink is yielded,
// however in symlink mode, walkdir yields broken symlinks as errors
// however in symlink mode, walkdir yields broken symlinks as errors.
// Note that walkdir _always_ follows root symlinks. We handle this in the layer
// above wax.
#[test_case("broken-symlink", 1, 1 => matches None ; "broken symlinks should be yielded")]
// globs that match across a symlink should not follow the symlink
#[test_case("working-symlink/c/*", 0, 0 => matches None ; "working symlink should not be followed")]
Expand Down Expand Up @@ -556,11 +645,10 @@ mod test {
let dir = setup();

let path = AbsoluteSystemPathBuf::new(dir.path()).unwrap();
let (success, _error): (Vec<AbsoluteSystemPathBuf>, Vec<_>) =
match super::globwalk(&path, &[pattern.into()], &[], crate::WalkType::All) {
Ok(e) => e.into_iter().partition_result(),
Err(e) => return Some(e),
};
let success = match super::globwalk(&path, &[pattern.into()], &[], crate::WalkType::All) {
Ok(e) => e.into_iter(),
Err(e) => return Some(e),
};

assert_eq!(
success.len(),
Expand Down Expand Up @@ -1128,10 +1216,7 @@ mod test {
(crate::WalkType::Files, expected_files),
(crate::WalkType::All, expected),
] {
let (success, _): (Vec<AbsoluteSystemPathBuf>, Vec<_>) =
super::globwalk(&path, &include, &exclude, walk_type)
.unwrap()
.partition_result();
let success = super::globwalk(&path, &include, &exclude, walk_type).unwrap();

let success = success
.iter()
Expand Down Expand Up @@ -1220,12 +1305,11 @@ mod test {
let child = root.join_component("child");
let include = &["../*-file".to_string()];
let exclude = &[];
let iter = globwalk(&child, include, exclude, WalkType::Files).unwrap();
let iter = globwalk(&child, include, exclude, WalkType::Files)
.unwrap()
.into_iter();
let results = iter
.map(|entry| {
let entry = entry.unwrap();
root.anchor(entry).unwrap().to_str().unwrap().to_string()
})
.map(|entry| root.anchor(entry).unwrap().to_str().unwrap().to_string())
.collect::<Vec<_>>();
let expected = vec!["root-file".to_string()];
assert_eq!(results, expected);
Expand All @@ -1250,8 +1334,8 @@ mod test {
let exclude = &["apps/ignored".to_string(), "**/node_modules/**".to_string()];
let iter = globwalk(&root, include, exclude, WalkType::Files).unwrap();
let paths = iter
.into_iter()
.map(|path| {
let path = path.unwrap();
let relative = root.anchor(path).unwrap();
relative.to_str().unwrap().to_string()
})
Expand Down
9 changes: 3 additions & 6 deletions crates/turborepo-lib/src/package_manager/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,19 +394,16 @@ impl PackageManager {
pub fn get_package_jsons(
&self,
repo_root: &AbsoluteSystemPath,
) -> Result<Vec<AbsoluteSystemPathBuf>, Error> {
) -> Result<impl Iterator<Item = AbsoluteSystemPathBuf>, Error> {
let globs = self.get_workspace_globs(repo_root)?;

let walker = globwalk::globwalk(
let files = globwalk::globwalk(
repo_root,
&globs.package_json_inclusions,
&globs.raw_exclusions,
globwalk::WalkType::Files,
)?;
let items = walker
.map(|result| result.map_err(|e| e.into()))
.collect::<Result<Vec<_>, Error>>()?;
Ok(items)
Ok(files.into_iter())
}
}

Expand Down
6 changes: 3 additions & 3 deletions crates/turborepo-scm/src/package_deps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,15 @@ pub fn get_package_file_hashes_from_inputs<S: AsRef<str>>(
Either::Left([package_unix_path, raw_glob.as_ref()].join("/"))
}
});
let iter = globwalk::globwalk(
let files = globwalk::globwalk(
turbo_root,
&inclusions,
&exclusions,
globwalk::WalkType::Files,
)?;
let to_hash = iter
let to_hash = files
.iter()
.map(|entry| {
let entry = entry?;
let path = git_root.anchor(entry)?.to_unix()?;
Ok(path)
})
Expand Down
47 changes: 46 additions & 1 deletion crates/turborepo-wax/src/walk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,17 @@ macro_rules! walk {
.strip_prefix(&$state.prefix)
.expect("path is not in tree");
let depth = entry.depth().saturating_sub(1);
// Globs don't include the root token, but absolute paths on unix do.
// If we have an absolute path and are on unix, skip that token so that
// matching up components will work below.
let to_skip_in_path = if cfg!(unix) && path.is_absolute() {
depth + 1
} else {
depth
};
for candidate in path
.components()
.skip(depth)
.skip(to_skip_in_path)
.filter_map(|component| match component {
Component::Normal(component) => Some(CandidatePath::from(component)),
_ => None,
Expand Down Expand Up @@ -570,9 +578,22 @@ pub struct Walk<'g> {
root: PathBuf,
prefix: PathBuf,
walk: walkdir::IntoIter,
// This is a hack to express an empty iterator
is_empty: bool,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't blocking, but I feel like we could either make this into an Option or have walk::walk return a generic impl Iterator and then return an empty iterator instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, this is definitely not the optimal way of doing this.

}

impl<'g> Walk<'g> {
fn empty() -> Self {
Self {
pattern: Cow::Owned(Regex::new("").unwrap()),
components: vec![],
root: PathBuf::new(),
prefix: PathBuf::new(),
walk: walkdir::WalkDir::new(PathBuf::new()).into_iter(),
is_empty: true,
}
}

fn compile<'t, I>(tokens: I) -> Result<Vec<Regex>, CompileError>
where
I: IntoIterator<Item = &'t Token<'t>>,
Expand Down Expand Up @@ -602,13 +623,15 @@ impl<'g> Walk<'g> {
root,
prefix,
walk,
is_empty,
} = self;
Walk {
pattern: Cow::Owned(pattern.into_owned()),
components,
root,
prefix,
walk,
is_empty,
}
}

Expand Down Expand Up @@ -704,6 +727,9 @@ impl Iterator for Walk<'_> {
type Item = WalkItem<'static>;

fn next(&mut self) -> Option<Self::Item> {
if self.is_empty {
return None;
}
walk!(self => |entry| {
return Some(entry.map(WalkEntry::into_owned));
});
Expand Down Expand Up @@ -907,6 +933,24 @@ pub fn walk<'g>(
}
},
);
if matches!(link, LinkBehavior::ReadFile) {
if let Ok(tail) = root.strip_prefix(directory) {
let found = tail
.components()
.try_fold(directory.to_path_buf(), |accum, c| {
let candidate = accum.join(c);
if candidate.is_symlink() {
None
} else {
Some(candidate)
}
})
.is_none();
if found {
return Walk::empty();
}
}
}
let components =
Walk::compile(glob.tree.as_ref().tokens()).expect("failed to compile glob sub-expressions");
Walk {
Expand All @@ -921,6 +965,7 @@ pub fn walk<'g>(
})
.max_depth(depth)
.into_iter(),
is_empty: false,
}
}

Expand Down