From f046524026b9de30f6f3768b13df6ce4ebfc232a Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Tue, 26 Nov 2024 19:17:02 +0100 Subject: [PATCH 1/2] chore: adds quality checks --- Cargo.lock | 91 +++++++++++++++++++++++++++++++++++ Cargo.toml | 5 ++ src/bin/quality_check.rs | 101 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 src/bin/quality_check.rs diff --git a/Cargo.lock b/Cargo.lock index 28608fe..737bb62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,55 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys 0.59.0", +] + [[package]] name = "anyhow" version = "1.0.93" @@ -111,6 +154,12 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + [[package]] name = "criterion" version = "0.5.1" @@ -184,6 +233,29 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "env_filter" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -223,6 +295,12 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "indexmap" version = "2.6.0" @@ -244,6 +322,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -692,12 +776,19 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "vector_quantizer" version = "0.0.2" dependencies = [ "anyhow", "criterion", + "env_logger", "log", "ndarray", "ndarray-rand", diff --git a/Cargo.toml b/Cargo.toml index 3fce38e..19c3024 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ rand_distr = "0.4.3" rayon = "1.10.0" log = "0.4.22" thiserror = "2.0.3" +env_logger = "0.11.5" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } @@ -35,3 +36,7 @@ path = "src/bin/example.rs" [[bin]] name = "readme_example" path = "src/bin/readme_example.rs" + +[[bin]] +name = "quality_check" +path = "src/bin/quality_check.rs" diff --git a/src/bin/quality_check.rs b/src/bin/quality_check.rs new file mode 100644 index 0000000..a4baae2 --- /dev/null +++ b/src/bin/quality_check.rs @@ -0,0 +1,101 @@ +use anyhow::Result; +use log::info; +use ndarray::{s, Array2, ArrayView1, Axis}; +use ndarray_rand::RandomExt; +use rand_distr::Uniform; +use std::time::Instant; +use vector_quantizer::pq::PQ; + +fn euclidean_distance(a: &ArrayView1, b: &ArrayView1) -> f32 { + a.iter() + .zip(b.iter()) + .map(|(x, y)| (x - y).powi(2)) + .sum::() + .sqrt() +} + +fn calculate_reconstruction_error(original: &Array2, reconstructed: &Array2) -> f32 { + original + .outer_iter() + .zip(reconstructed.outer_iter()) + .map(|(orig, recon)| { + orig.iter() + .zip(recon.iter()) + .map(|(o, r)| (o - r).powi(2)) + .sum::() + }) + .sum::() + / original.len() as f32 +} + +fn calculate_recall(original: &Array2, compressed: &Array2, k: usize) -> Result { + let n_samples = original.len_of(Axis(0)); + let mut total_recall = 0.0; + + for i in 0..n_samples { + let query = original.slice(s![i, ..]); + + let mut true_neighbors: Vec<(usize, f32)> = (0..n_samples) + .filter(|&j| j != i) + .map(|j| (j, euclidean_distance(&query, &original.slice(s![j, ..])))) + .collect(); + true_neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let true_neighbors: Vec = + true_neighbors.iter().take(k).map(|&(idx, _)| idx).collect(); + + let mut approx_neighbors: Vec<(usize, f32)> = (0..n_samples) + .filter(|&j| j != i) + .map(|j| { + ( + j, + euclidean_distance(&compressed.slice(s![i, ..]), &compressed.slice(s![j, ..])), + ) + }) + .collect(); + approx_neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let approx_neighbors: Vec = approx_neighbors + .iter() + .take(k) + .map(|&(idx, _)| idx) + .collect(); + + let intersection: f32 = true_neighbors + .iter() + .filter(|&&idx| approx_neighbors.contains(&idx)) + .count() as f32; + + total_recall += intersection / k as f32; + } + + Ok(total_recall / n_samples as f32) +} + +fn main() -> Result<()> { + env_logger::init(); + + let n_samples = 1000; + let n_dims = 128; + let original_data = Array2::::random((n_samples, n_dims), Uniform::new(0.0, 1.0)); + + let m = 16; + let ks = 256; + let iterations = 10; + + let mut pq = PQ::try_new(m, ks)?; + + let fit_start = Instant::now(); + pq.fit(&original_data, iterations)?; + println!("Fit completed in {:?}", fit_start.elapsed()); + + let encode_start = Instant::now(); + let compressed_data = pq.compress(&original_data)?; + println!("Compression completed in {:?}", encode_start.elapsed()); + + let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data); + println!("Reconstruction Error: {:.4}", reconstruction_error); + + let recall = calculate_recall(&original_data, &compressed_data, 10)?; + println!("Recall@10: {:.4}", recall); + + Ok(()) +} From 93e6b8bc2f9ac119cfcd2e42440e827d6da35208 Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Tue, 26 Nov 2024 19:35:02 +0100 Subject: [PATCH 2/2] adds plots --- .gitignore | 4 +- Cargo.lock | 1 + Cargo.toml | 1 + Makefile | 14 +++++ plots/main.py | 52 ++++++++++++++++ plots/requirements.txt | 3 + src/bin/quality_check.rs | 126 +++++++++++++++++++++++++++++++++------ 7 files changed, 181 insertions(+), 20 deletions(-) create mode 100644 Makefile create mode 100644 plots/main.py create mode 100644 plots/requirements.txt diff --git a/.gitignore b/.gitignore index ec376bb..36c7cd9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .idea -target \ No newline at end of file +target +benchmark_results.csv +benchmark_results.png \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 737bb62..818433d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -796,6 +796,7 @@ dependencies = [ "rand 0.9.0-alpha.2", "rand_distr", "rayon", + "serde", "thiserror", ] diff --git a/Cargo.toml b/Cargo.toml index 19c3024..f95f1c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ rayon = "1.10.0" log = "0.4.22" thiserror = "2.0.3" env_logger = "0.11.5" +serde = { version = "1.0.215", features = ["derive"] } [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b008ac9 --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +.PHONY: quality_check + +RUST_LOG := info +PLOTS_DIR := plots +BENCHMARK_RESULTS := benchmark_results.png + +quality_check: + $(RUST_LOG) cargo run --release --bin quality_check + cd $(PLOTS_DIR) && python3 main.py + mv $(PLOTS_DIR)/$(BENCHMARK_RESULTS) ./$(BENCHMARK_RESULTS) + +clean: + rm -f $(BENCHMARK_RESULTS) + rm -f benchmark_results.csv \ No newline at end of file diff --git a/plots/main.py b/plots/main.py new file mode 100644 index 0000000..02f34c2 --- /dev/null +++ b/plots/main.py @@ -0,0 +1,52 @@ +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +# Read the benchmark results +df = pd.read_csv('../benchmark_results.csv') + +# Set up the plotting style +plt.style.use('seaborn-v0_8-darkgrid') +sns.set_palette("husl") + +# Create a figure with multiple subplots +fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) + +# Plot 1: Timing metrics +ax1.plot(df['n_samples'], df['fit_time_ms'], marker='o', label='Fit Time') +ax1.plot(df['n_samples'], df['compression_time_ms'], marker='o', label='Compression Time') +ax1.set_xlabel('Number of Samples') +ax1.set_ylabel('Time (ms)') +ax1.set_title('Processing Time vs Dataset Size') +ax1.legend() +ax1.set_xscale('log') +ax1.set_yscale('log') + +# Plot 2: Quality metrics +ax2.plot(df['n_samples'], df['reconstruction_error'], marker='o', label='Reconstruction Error') +ax2.plot(df['n_samples'], df['recall'], marker='o', label='Recall@10') +ax2.set_xlabel('Number of Samples') +ax2.set_ylabel('Score') +ax2.set_title('Quality Metrics vs Dataset Size') +ax2.legend() +ax2.set_xscale('log') + +# Plot 3: Memory reduction +ax3.plot(df['n_samples'], (1 - df['memory_reduction_ratio']) * 100, marker='o') +ax3.set_xlabel('Number of Samples') +ax3.set_ylabel('Memory Reduction (%)') +ax3.set_title('Memory Reduction vs Dataset Size') +ax3.set_xscale('log') + +# Plot 4: Time per sample +df['time_per_sample'] = (df['compression_time_ms']) / df['n_samples'] +ax4.plot(df['n_samples'], df['time_per_sample'], marker='o') +ax4.set_xlabel('Number of Samples') +ax4.set_ylabel('Compression Time per Sample (ms)') +ax4.set_title('Scaling Efficiency') +ax4.set_xscale('log') +ax4.set_yscale('log') + +plt.tight_layout() +plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight') +plt.close() \ No newline at end of file diff --git a/plots/requirements.txt b/plots/requirements.txt new file mode 100644 index 0000000..3b9a24f --- /dev/null +++ b/plots/requirements.txt @@ -0,0 +1,3 @@ +pandas +matplotlib +seaborn \ No newline at end of file diff --git a/src/bin/quality_check.rs b/src/bin/quality_check.rs index a4baae2..fa405cf 100644 --- a/src/bin/quality_check.rs +++ b/src/bin/quality_check.rs @@ -3,9 +3,60 @@ use log::info; use ndarray::{s, Array2, ArrayView1, Axis}; use ndarray_rand::RandomExt; use rand_distr::Uniform; +use serde::Serialize; +use std::fs::File; +use std::io::Write; use std::time::Instant; use vector_quantizer::pq::PQ; +#[derive(Serialize)] +struct BenchmarkResult { + n_samples: usize, + n_dims: usize, + fit_time_ms: f64, + compression_time_ms: f64, + reconstruction_error: f32, + recall: f32, + memory_reduction_ratio: f32, +} + +fn run_benchmark( + n_samples: usize, + n_dims: usize, + m: usize, + ks: u32, + iterations: usize, +) -> Result { + let original_data = Array2::::random((n_samples, n_dims), Uniform::new(0.0, 1.0)); + + let mut pq = PQ::try_new(m, ks)?; + + let fit_start = Instant::now(); + pq.fit(&original_data, iterations)?; + let fit_time = fit_start.elapsed().as_secs_f64() * 1000.0; + + let compress_start = Instant::now(); + let compressed_data = pq.compress(&original_data)?; + let compression_time = compress_start.elapsed().as_secs_f64() * 1000.0; + + let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data); + let recall = calculate_recall(&original_data, &compressed_data, 10)?; + + let original_size = n_samples * n_dims * size_of::(); + let compressed_size = n_samples * m; // Each subspace uses 1 byte + let memory_reduction_ratio = compressed_size as f32 / original_size as f32; + + Ok(BenchmarkResult { + n_samples, + n_dims, + fit_time_ms: fit_time, + compression_time_ms: compression_time, + reconstruction_error, + recall, + memory_reduction_ratio, + }) +} + fn euclidean_distance(a: &ArrayView1, b: &ArrayView1) -> f32 { a.iter() .zip(b.iter()) @@ -30,12 +81,30 @@ fn calculate_reconstruction_error(original: &Array2, reconstructed: &Array2 fn calculate_recall(original: &Array2, compressed: &Array2, k: usize) -> Result { let n_samples = original.len_of(Axis(0)); + + let max_eval_samples = 1000; + let eval_samples = if n_samples > max_eval_samples { + max_eval_samples + } else { + n_samples + }; + let mut total_recall = 0.0; + let step = n_samples / eval_samples; - for i in 0..n_samples { + for i in (0..n_samples).step_by(step) { let query = original.slice(s![i, ..]); - let mut true_neighbors: Vec<(usize, f32)> = (0..n_samples) + let search_window = if n_samples > 10000 { 5000 } else { n_samples }; + + let start_idx = if i > search_window / 2 { + i - search_window / 2 + } else { + 0 + }; + let end_idx = (i + search_window / 2).min(n_samples); + + let mut true_neighbors: Vec<(usize, f32)> = (start_idx..end_idx) .filter(|&j| j != i) .map(|j| (j, euclidean_distance(&query, &original.slice(s![j, ..])))) .collect(); @@ -43,7 +112,7 @@ fn calculate_recall(original: &Array2, compressed: &Array2, k: usize) let true_neighbors: Vec = true_neighbors.iter().take(k).map(|&(idx, _)| idx).collect(); - let mut approx_neighbors: Vec<(usize, f32)> = (0..n_samples) + let mut approx_neighbors: Vec<(usize, f32)> = (start_idx..end_idx) .filter(|&j| j != i) .map(|j| { ( @@ -67,35 +136,54 @@ fn calculate_recall(original: &Array2, compressed: &Array2, k: usize) total_recall += intersection / k as f32; } - Ok(total_recall / n_samples as f32) + Ok(total_recall / (n_samples / step) as f32) } fn main() -> Result<()> { env_logger::init(); - let n_samples = 1000; + let sample_sizes = vec![1000, 5000, 10000, 50000, 100000]; let n_dims = 128; - let original_data = Array2::::random((n_samples, n_dims), Uniform::new(0.0, 1.0)); - let m = 16; let ks = 256; let iterations = 10; - let mut pq = PQ::try_new(m, ks)?; - - let fit_start = Instant::now(); - pq.fit(&original_data, iterations)?; - println!("Fit completed in {:?}", fit_start.elapsed()); + let mut results = Vec::new(); - let encode_start = Instant::now(); - let compressed_data = pq.compress(&original_data)?; - println!("Compression completed in {:?}", encode_start.elapsed()); + for n_samples in sample_sizes { + info!("Running benchmark with {} samples...", n_samples); + let result = run_benchmark(n_samples, n_dims, m, ks, iterations)?; + results.push(result); + } - let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data); - println!("Reconstruction Error: {:.4}", reconstruction_error); + let mut file = File::create("benchmark_results.csv")?; + writeln!(file, "n_samples,n_dims,fit_time_ms,compression_time_ms,reconstruction_error,recall,memory_reduction_ratio")?; + + for result in &results { + writeln!( + file, + "{},{},{},{},{},{},{}", + result.n_samples, + result.n_dims, + result.fit_time_ms, + result.compression_time_ms, + result.reconstruction_error, + result.recall, + result.memory_reduction_ratio + )?; + } - let recall = calculate_recall(&original_data, &compressed_data, 10)?; - println!("Recall@10: {:.4}", recall); + for result in &results { + info!("\nResults for {} samples:", result.n_samples); + info!("Fit time: {:.2}ms", result.fit_time_ms); + info!("Compression time: {:.2}ms", result.compression_time_ms); + info!("Reconstruction Error: {:.4}", result.reconstruction_error); + info!("Recall@10: {:.4}", result.recall); + info!( + "Memory reduction: {:.2}%", + (1.0 - result.memory_reduction_ratio) * 100.0 + ); + } Ok(()) }