diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c1336ff..1a936d4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
rev: v4.6.0
hooks:
- id: check-added-large-files
+ exclude: assets/
- id: check-yaml
args: [--unsafe]
- id: check-toml
diff --git a/README.md b/README.md
index 3d5fc12..5d46c54 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,81 @@
# Unfolder
-`unfolder` is a CLI tool to find:
-- Large
-- Duplicated
-- Strangely named
+![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen)
+[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
-Files in directories of any size and structure extremely fast.
+An extremely fast directory exploration tool to find:
+- [x] Largest files
+- [x] Duplicated files
+- [ ] ... to be continued
+
+In directories of any size and structure.
+
+
+
+
+
Example of analyzing the Apache Airflow codebase
+
+
+
+## Use cases
+
+Unfolder can be useful for:
+
+* **Software maintainers** to reduce repo size and eliminate duplicate files, within or across projects.
+* **Project managers** to avoid extra data storage costs and have a single location for each key artifact.
+
+## Benchmarks
+
+Unfolder analyzes codebases of large open-source projects in under half a second:
+
+| Project | Files | Folders | Elapsed time, ms |
+|-----------------------------------------------------|-------|---------|------------------|
+| [Apache Airflow](https://github.com/apache/airflow) | 7,558 | 1,713 | 310 |
+| [Ruff](https://github.com/astral-sh/ruff) | 7,374 | 615 | 182 |
+| [React](https://github.com/facebook/react) | 6,467 | 532 | 156 |
+| [CPython](https://github.com/python/cpython) | 5,182 | 420 | 136 |
+| [Kedro](https://github.com/kedro-org/kedro) | 527 | 122 | 176 |
+
+_Time values are measured during local runs on a MacBook Pro with Apple M1 Max chip, 32 GB RAM._
+
+## Getting started
+
+### Installation
+
+Currently, only installation from source is supported:
+
+1. Make sure you have the Rust toolchain set up.
+ - This can either be done [as the Rust guide suggests](https://www.rust-lang.org/tools/install).
+ - Or if you're using [RustRover IDE](https://www.jetbrains.com/rust/), it manages the toolchain automatically.
+1. Clone [project repo](https://github.com/yury-fedotov/unfolder) locally, and `cd` there.
+1. Run `cargo build --release` to build the binary executable file for the tool.
+1. Run `cargo install --path .` to install this executable and make it available under `unfolder` namespace in CLI.
+
+### Usage
+
+The tool currently has just one CLI command which is available as:
+
+```bash
+unfolder path/to/directory/
+```
+
+In addition to the directory path, it can take 3 optional arguments:
+
+| Argument | Short | Long | Options | Default |
+|------------------------------------------------------|-------|-----------------|---------------------------------------------------------------------------------------------------|---------------|
+| List of file extensions to analyze | -e | --extensions | Comma-separated: e.g. py,png | All |
+| Minimum file size to consider for duplicate analysis | | --min_file_size | One of the following aliases: blank, config, code, excel, document, image, gif, audio, video, large | code (100 KB) |
+| Number of largest files to return based on size | -n | --n_top | Any positive integer | 5 |
+
+So for example:
+
+```bash
+unfolder path/to/directory/ -e csv,pkl,png,gif --min_file_size image
+```
+
+Would:
+* Analyze `path/to/directory/`.
+* Consider only files of `csv`, `pkl`, `png` and `gif` extensions.
+* While identifying duplicates, ignore files smaller than the `image` alias implies (10 MB).
+
+You can also run `unfolder -h` to get info on arguments.
diff --git a/assets/airflow_demo.png b/assets/airflow_demo.png
new file mode 100644
index 0000000..885a96e
Binary files /dev/null and b/assets/airflow_demo.png differ
diff --git a/src/cli.rs b/src/cli.rs
index 8d4b794..9d63406 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -29,7 +29,7 @@ pub struct CLIArgs {
pub fn parse_args() -> CLIArgs {
let matches = Command::new("Directory Traversal")
- .version("1.0")
+ .version("0.0.1")
.long_about("Traverses a directory and processes files based on extensions")
.arg(
Arg::new("directory")
@@ -49,7 +49,7 @@ pub fn parse_args() -> CLIArgs {
Arg::new("min_file_size")
.help("Minimum file size to consider (alias)")
.long("min_file_size")
- .default_value("document"),
+ .default_value("code"),
)
.arg(
Arg::new("n_top")
@@ -82,7 +82,7 @@ pub fn parse_args() -> CLIArgs {
.map(|s| s.parse().unwrap_or(5)) // Parse the value and default to 5 on error
.unwrap_or(5);
- let min_file_size = get_size_by_alias(size_alias.as_str()).unwrap_or(MEGABYTE); // Default to 1 MB if alias not found
+ let min_file_size = get_size_by_alias(size_alias.as_str()).unwrap_or(100 * KILOBYTE);
CLIArgs {
directory,
diff --git a/src/results.rs b/src/results.rs
index 599c2d2..a5723de 100644
--- a/src/results.rs
+++ b/src/results.rs
@@ -62,22 +62,39 @@ impl AnalysisResults {
.bold()
.color(OutputFormat::Numbers.color()),
);
- println!(
- "{} {} identified, {} of relevant types, {} analyzed for content",
- "📄 Files:".to_string().bold(),
- n_files_identified_formatted
- .to_string()
- .bold()
- .color(OutputFormat::Numbers.color()),
- n_files_considered_formatted
- .to_string()
- .bold()
- .color(OutputFormat::Numbers.color()),
- n_files_hashed_formatted
- .to_string()
- .bold()
- .color(OutputFormat::Numbers.color()),
- );
+ if self.complete_statistics.n_files_identified
+ != self.complete_statistics.n_files_considered
+ {
+ println!(
+ "{} {} identified, {} of relevant types, {} largest are analyzed for being duplicates",
+ "📄 Files:".to_string().bold(),
+ n_files_identified_formatted
+ .to_string()
+ .bold()
+ .color(OutputFormat::Numbers.color()),
+ n_files_considered_formatted
+ .to_string()
+ .bold()
+ .color(OutputFormat::Numbers.color()),
+ n_files_hashed_formatted
+ .to_string()
+ .bold()
+ .color(OutputFormat::Numbers.color()),
+ );
+ } else {
+ println!(
+ "{} {} identified, {} largest are analyzed for being duplicates",
+ "📄 Files:".to_string().bold(),
+ n_files_identified_formatted
+ .to_string()
+ .bold()
+ .color(OutputFormat::Numbers.color()),
+ n_files_hashed_formatted
+ .to_string()
+ .bold()
+ .color(OutputFormat::Numbers.color()),
+ );
+ }
println!();
println!(