diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c1336ff..1a936d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,7 @@ repos: rev: v4.6.0 hooks: - id: check-added-large-files + exclude: assets/ - id: check-yaml args: [--unsafe] - id: check-toml diff --git a/README.md b/README.md index 3d5fc12..5d46c54 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,81 @@ # Unfolder -`unfolder` is a CLI tool to find: -- Large -- Duplicated -- Strangely named +![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen) +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) -Files in directories of any size and structure extremely fast. +An extremely fast directory exploration tool to find: +- [x] Largest files +- [x] Duplicated files +- [ ] ... to be continued + +In directories of any size and structure. + +
+
+ A bar chart with benchmark results +

Example of analyzing the Apache Airflow codebase

+
+
+
+## Use cases
+
+Unfolder can be useful for:
+
+* **Software maintainers** to reduce repo size and eliminate duplicate files, within or across projects.
+* **Project managers** to avoid extra data storage costs and have a single location for each key artifact.
+
+## Benchmarks
+
+Unfolder analyzes codebases of large open-source projects in under half a second:
+
+| Project | Files | Folders | Elapsed time, ms |
+|-----------------------------------------------------|-------|---------|------------------|
+| [Apache Airflow](https://github.com/apache/airflow) | 7,558 | 1,713 | 310 |
+| [Ruff](https://github.com/astral-sh/ruff) | 7,374 | 615 | 182 |
+| [React](https://github.com/facebook/react) | 6,467 | 532 | 156 |
+| [CPython](https://github.com/python/cpython) | 5,182 | 420 | 136 |
+| [Kedro](https://github.com/kedro-org/kedro) | 527 | 122 | 176 |
+
+_Time values are measured during local runs on a MacBook Pro with Apple M1 Max chip, 32 GB RAM._
+
+## Getting started
+
+### Installation
+
+Currently, only installation from source is supported:
+
+1. Make sure you have the Rust toolchain set up.
+   - This can either be done [as the Rust guide suggests](https://www.rust-lang.org/tools/install).
+   - Or if you're using the [RustRover IDE](https://www.jetbrains.com/rust/), it manages it automatically.
+1. Clone the [project repo](https://github.com/yury-fedotov/unfolder) locally, and `cd` there.
+1. Run `cargo build --release` to build the binary executable file for the tool.
+1. Run `cargo install --path .` to install this executable and make it available under the `unfolder` namespace in CLI.
+
+### Usage
+
+The tool currently has just one CLI command which is available as:
+
+```bash
+unfolder path/to/directory/
+```
+
+In addition to the path to the directory, it can take 3 optional arguments:
+
+| Argument | Short | Long | Options | Default |
+|------------------------------------------------------|-------|-----------------|-----------------------------------------------------------------------------------------------------|---------------|
+| List of file extensions to analyze | -e | --extensions | Comma-separated: e.g. py,png | All |
+| Minimum file size to consider for duplicate analysis | | --min_file_size | One of the following aliases: blank, config, code, excel, document, image, gif, audio, video, large | code (100 KB) |
+| Number of largest files to return based on size | -n | --n_top | Any positive integer | 5 |
+
+So for example:
+
+```bash
+unfolder path/to/directory/ -e csv,pkl,png,gif --min_file_size image
+```
+
+Would:
+* Analyze `path/to/directory/`.
+* Consider only files of `csv`, `pkl`, `png` and `gif` extensions.
+* While identifying duplicates, ignore files smaller than the `image` alias implies (10 MB).
+
+You can also run `unfolder -h` to get info on arguments.
diff --git a/assets/airflow_demo.png b/assets/airflow_demo.png new file mode 100644 index 0000000..885a96e Binary files /dev/null and b/assets/airflow_demo.png differ diff --git a/src/cli.rs b/src/cli.rs index 8d4b794..9d63406 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -29,7 +29,7 @@ pub struct CLIArgs { pub fn parse_args() -> CLIArgs { let matches = Command::new("Directory Traversal") - .version("1.0") + .version("0.0.1") .long_about("Traverses a directory and processes files based on extensions") .arg( Arg::new("directory") @@ -49,7 +49,7 @@ pub fn parse_args() -> CLIArgs { Arg::new("min_file_size") .help("Minimum file size to consider (alias)") .long("min_file_size") - .default_value("document"), + .default_value("code"), ) .arg( Arg::new("n_top") @@ -82,7 +82,7 @@ pub fn parse_args() -> CLIArgs { .map(|s| s.parse().unwrap_or(5)) // Parse the value and default to 5 on error .unwrap_or(5); - let min_file_size = get_size_by_alias(size_alias.as_str()).unwrap_or(MEGABYTE); // Default to 1 MB if alias not found + let min_file_size = get_size_by_alias(size_alias.as_str()).unwrap_or(100 * KILOBYTE); CLIArgs { directory, diff --git a/src/results.rs b/src/results.rs index 599c2d2..a5723de 100644 --- a/src/results.rs +++ b/src/results.rs @@ -62,22 +62,39 @@ impl AnalysisResults { .bold() .color(OutputFormat::Numbers.color()), ); - println!( - "{} {} identified, {} of relevant types, {} analyzed for content", - "📄 Files:".to_string().bold(), - n_files_identified_formatted - .to_string() - .bold() - .color(OutputFormat::Numbers.color()), - n_files_considered_formatted - .to_string() - .bold() - .color(OutputFormat::Numbers.color()), - n_files_hashed_formatted - .to_string() - .bold() - .color(OutputFormat::Numbers.color()), - ); + if self.complete_statistics.n_files_identified + != self.complete_statistics.n_files_considered + { + println!( + "{} {} identified, {} of relevant types, {} largest are analyzed for being duplicates", + "📄 Files:".to_string().bold(), + 
n_files_identified_formatted + .to_string() + .bold() + .color(OutputFormat::Numbers.color()), + n_files_considered_formatted + .to_string() + .bold() + .color(OutputFormat::Numbers.color()), + n_files_hashed_formatted + .to_string() + .bold() + .color(OutputFormat::Numbers.color()), + ); + } else { + println!( + "{} {} identified, {} largest are analyzed for being duplicates", + "📄 Files:".to_string().bold(), + n_files_identified_formatted + .to_string() + .bold() + .color(OutputFormat::Numbers.color()), + n_files_hashed_formatted + .to_string() + .bold() + .color(OutputFormat::Numbers.color()), + ); + } println!(); println!(