From f046524026b9de30f6f3768b13df6ce4ebfc232a Mon Sep 17 00:00:00 2001
From: Michele Riva <michele.riva@oramasearch.com>
Date: Tue, 26 Nov 2024 19:17:02 +0100
Subject: [PATCH 1/2] chore: adds quality checks

---
 Cargo.lock               |  91 +++++++++++++++++++++++++++++++++++
 Cargo.toml               |   5 ++
 src/bin/quality_check.rs | 101 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 197 insertions(+)
 create mode 100644 src/bin/quality_check.rs

diff --git a/Cargo.lock b/Cargo.lock
index 28608fe..737bb62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -17,12 +17,55 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 
+[[package]]
+name = "anstream"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
 [[package]]
 name = "anstyle"
 version = "1.0.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
 
+[[package]]
+name = "anstyle-parse"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+dependencies = [
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.93"
@@ -111,6 +154,12 @@ version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
 
+[[package]]
+name = "colorchoice"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
+
 [[package]]
 name = "criterion"
 version = "0.5.1"
@@ -184,6 +233,29 @@ version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
+[[package]]
+name = "env_filter"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+ "regex",
+]
+
+[[package]]
+name = "env_logger"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "humantime",
+ "log",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@@ -223,6 +295,12 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
 
+[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
 [[package]]
 name = "indexmap"
 version = "2.6.0"
@@ -244,6 +322,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -692,12 +776,19 @@ version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
 [[package]]
 name = "vector_quantizer"
 version = "0.0.2"
 dependencies = [
  "anyhow",
  "criterion",
+ "env_logger",
  "log",
  "ndarray",
  "ndarray-rand",
diff --git a/Cargo.toml b/Cargo.toml
index 3fce38e..19c3024 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ rand_distr = "0.4.3"
 rayon = "1.10.0"
 log = "0.4.22"
 thiserror = "2.0.3"
+env_logger = "0.11.5"
 
 [dev-dependencies]
 criterion = { version = "0.5", features = ["html_reports"] }
@@ -35,3 +36,7 @@ path = "src/bin/example.rs"
 [[bin]]
 name = "readme_example"
 path = "src/bin/readme_example.rs"
+
+[[bin]]
+name = "quality_check"
+path = "src/bin/quality_check.rs"
diff --git a/src/bin/quality_check.rs b/src/bin/quality_check.rs
new file mode 100644
index 0000000..a4baae2
--- /dev/null
+++ b/src/bin/quality_check.rs
@@ -0,0 +1,101 @@
+use anyhow::Result;
+use log::info;
+use ndarray::{s, Array2, ArrayView1, Axis};
+use ndarray_rand::RandomExt;
+use rand_distr::Uniform;
+use std::time::Instant;
+use vector_quantizer::pq::PQ;
+
+fn euclidean_distance(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
+    a.iter()
+        .zip(b.iter())
+        .map(|(x, y)| (x - y).powi(2)) // squared difference of each paired component
+        .sum::<f32>() // total of the squared differences
+        .sqrt() // root of that total, i.e. the Euclidean (L2) distance
+}
+
+/// Mean squared error between `original` and `reconstructed`, averaged over all elements.
+fn calculate_reconstruction_error(original: &Array2<f32>, reconstructed: &Array2<f32>) -> f32 {
+    let paired_rows = original.outer_iter().zip(reconstructed.outer_iter());
+    // Accumulate each row's squared error first, then add up the per-row totals.
+    let total_squared_error = paired_rows
+        .map(|(source_row, rebuilt_row)| {
+            let squared = source_row.iter().zip(rebuilt_row.iter()).map(|(s, r)| (s - r).powi(2));
+            squared.sum::<f32>()
+        })
+        .sum::<f32>();
+    // `len()` is the total element count of `original`, so this is a per-element mean.
+    total_squared_error / original.len() as f32
+}
+
+fn calculate_recall(original: &Array2<f32>, compressed: &Array2<f32>, k: usize) -> Result<f32> {
+    let n_samples = original.len_of(Axis(0));
+    let mut total_recall = 0.0;
+
+    for i in 0..n_samples {
+        let query = original.slice(s![i, ..]);
+
+        let mut true_neighbors: Vec<(usize, f32)> = (0..n_samples)
+            .filter(|&j| j != i)
+            .map(|j| (j, euclidean_distance(&query, &original.slice(s![j, ..]))))
+            .collect();
+        true_neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); // nearest first; NaN panics
+        let true_neighbors: Vec<usize> =
+            true_neighbors.iter().take(k).map(|&(idx, _)| idx).collect();
+
+        let mut approx_neighbors: Vec<(usize, f32)> = (0..n_samples)
+            .filter(|&j| j != i)
+            .map(|j| {
+                (
+                    j,
+                    euclidean_distance(&compressed.slice(s![i, ..]), &compressed.slice(s![j, ..])),
+                )
+            })
+            .collect();
+        approx_neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); // same order; NaN panics
+        let approx_neighbors: Vec<usize> = approx_neighbors
+            .iter()
+            .take(k)
+            .map(|&(idx, _)| idx)
+            .collect();
+        // Count the true neighbors that also show up in the approximate top-k list.
+        let intersection: f32 = true_neighbors
+            .iter()
+            .filter(|&&idx| approx_neighbors.contains(&idx))
+            .count() as f32;
+        // Per-query recall; the divisor is always k, even when fewer neighbors exist.
+        total_recall += intersection / k as f32;
+    }
+
+    Ok(total_recall / n_samples as f32)
+}
+
+fn main() -> Result<()> {
+    env_logger::init();
+
+    let n_samples = 1000;
+    let n_dims = 128;
+    let original_data = Array2::<f32>::random((n_samples, n_dims), Uniform::new(0.0, 1.0));
+
+    let m = 16;
+    let ks = 256;
+    let iterations = 10;
+
+    let mut pq = PQ::try_new(m, ks)?;
+
+    let fit_start = Instant::now();
+    pq.fit(&original_data, iterations)?;
+    println!("Fit completed in {:?}", fit_start.elapsed());
+
+    let encode_start = Instant::now();
+    let compressed_data = pq.compress(&original_data)?;
+    println!("Compression completed in {:?}", encode_start.elapsed());
+
+    let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data);
+    println!("Reconstruction Error: {:.4}", reconstruction_error);
+
+    let recall = calculate_recall(&original_data, &compressed_data, 10)?;
+    println!("Recall@10: {:.4}", recall);
+
+    Ok(())
+}

From 93e6b8bc2f9ac119cfcd2e42440e827d6da35208 Mon Sep 17 00:00:00 2001
From: Michele Riva <michele.riva@oramasearch.com>
Date: Tue, 26 Nov 2024 19:35:02 +0100
Subject: [PATCH 2/2] adds plots

---
 .gitignore               |   4 +-
 Cargo.lock               |   1 +
 Cargo.toml               |   1 +
 Makefile                 |  14 +++++
 plots/main.py            |  52 ++++++++++++++++
 plots/requirements.txt   |   3 +
 src/bin/quality_check.rs | 126 +++++++++++++++++++++++++++++++++------
 7 files changed, 181 insertions(+), 20 deletions(-)
 create mode 100644 Makefile
 create mode 100644 plots/main.py
 create mode 100644 plots/requirements.txt

diff --git a/.gitignore b/.gitignore
index ec376bb..36c7cd9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 .idea
-target
\ No newline at end of file
+target
+benchmark_results.csv
+benchmark_results.png
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 737bb62..818433d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -796,6 +796,7 @@ dependencies = [
  "rand 0.9.0-alpha.2",
  "rand_distr",
  "rayon",
+ "serde",
  "thiserror",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index 19c3024..f95f1c4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,6 +21,7 @@ rayon = "1.10.0"
 log = "0.4.22"
 thiserror = "2.0.3"
 env_logger = "0.11.5"
+serde = { version = "1.0.215", features = ["derive"] }
 
 [dev-dependencies]
 criterion = { version = "0.5", features = ["html_reports"] }
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b008ac9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,14 @@
+.PHONY: quality_check clean
+# Log level handed to the benchmark binary through the RUST_LOG environment variable.
+RUST_LOG := info
+PLOTS_DIR := plots
+BENCHMARK_RESULTS := benchmark_results.png
+# Run the benchmark, render the plots from its CSV, then move the PNG to the repository root.
+quality_check:
+	RUST_LOG=$(RUST_LOG) cargo run --release --bin quality_check
+	cd $(PLOTS_DIR) && python3 main.py
+	mv $(PLOTS_DIR)/$(BENCHMARK_RESULTS) ./$(BENCHMARK_RESULTS)
+# Remove the generated benchmark artifacts.
+clean:
+	rm -f $(BENCHMARK_RESULTS)
+	rm -f benchmark_results.csv
\ No newline at end of file
diff --git a/plots/main.py b/plots/main.py
new file mode 100644
index 0000000..02f34c2
--- /dev/null
+++ b/plots/main.py
@@ -0,0 +1,52 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def finish_axes(ax, ylabel, title, legend=False, log_y=False):
+    """Apply labels, title, optional legend and the axis scales to one panel."""
+    ax.set_xlabel('Number of Samples')
+    ax.set_ylabel(ylabel)
+    ax.set_title(title)
+    if legend:
+        ax.legend()
+    ax.set_xscale('log')  # every panel uses a logarithmic sample-count axis
+    if log_y:
+        ax.set_yscale('log')
+
+
+# Load the metrics table; the path is relative to the current working directory.
+results = pd.read_csv('../benchmark_results.csv')
+sizes = results['n_samples']
+
+# Styling is applied first so the figure below is created with it.
+plt.style.use('seaborn-v0_8-darkgrid')
+sns.set_palette("husl")
+
+# One 2x2 grid; `axes.flat` walks the panels row by row.
+fig, axes = plt.subplots(2, 2, figsize=(15, 12))
+timing_ax, quality_ax, memory_ax, scaling_ax = axes.flat
+
+# Top-left: absolute fit and compression times.
+for column, label in (('fit_time_ms', 'Fit Time'), ('compression_time_ms', 'Compression Time')):
+    timing_ax.plot(sizes, results[column], marker='o', label=label)
+finish_axes(timing_ax, 'Time (ms)', 'Processing Time vs Dataset Size', legend=True, log_y=True)
+
+# Top-right: reconstruction error and recall share one linear y-axis.
+for column, label in (('reconstruction_error', 'Reconstruction Error'), ('recall', 'Recall@10')):
+    quality_ax.plot(sizes, results[column], marker='o', label=label)
+finish_axes(quality_ax, 'Score', 'Quality Metrics vs Dataset Size', legend=True)
+
+# Bottom-left: plotted as the complement of the stored ratio, in percent.
+memory_saved_pct = (1 - results['memory_reduction_ratio']) * 100
+memory_ax.plot(sizes, memory_saved_pct, marker='o')
+finish_axes(memory_ax, 'Memory Reduction (%)', 'Memory Reduction vs Dataset Size')
+
+# Bottom-right: compression time normalised by dataset size.
+results['time_per_sample'] = results['compression_time_ms'] / sizes
+scaling_ax.plot(sizes, results['time_per_sample'], marker='o')
+finish_axes(scaling_ax, 'Compression Time per Sample (ms)', 'Scaling Efficiency', log_y=True)
+
+plt.tight_layout()
+plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight')
+plt.close()
\ No newline at end of file
diff --git a/plots/requirements.txt b/plots/requirements.txt
new file mode 100644
index 0000000..3b9a24f
--- /dev/null
+++ b/plots/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+matplotlib
+seaborn
\ No newline at end of file
diff --git a/src/bin/quality_check.rs b/src/bin/quality_check.rs
index a4baae2..fa405cf 100644
--- a/src/bin/quality_check.rs
+++ b/src/bin/quality_check.rs
@@ -3,9 +3,60 @@ use log::info;
 use ndarray::{s, Array2, ArrayView1, Axis};
 use ndarray_rand::RandomExt;
 use rand_distr::Uniform;
+use serde::Serialize;
+use std::fs::File;
+use std::io::Write;
 use std::time::Instant;
 use vector_quantizer::pq::PQ;
 
+#[derive(Serialize)]
+struct BenchmarkResult {
+    n_samples: usize, // number of rows in the generated dataset
+    n_dims: usize, // number of columns (dimensions) per row
+    fit_time_ms: f64, // elapsed time around `pq.fit`, in milliseconds
+    compression_time_ms: f64, // elapsed time around `pq.compress`, in milliseconds
+    reconstruction_error: f32, // result of `calculate_reconstruction_error`
+    recall: f32, // result of `calculate_recall` with k = 10
+    memory_reduction_ratio: f32, // compressed size / original size
+}
+
+/// Benchmarks PQ on random data from `Uniform::new(0.0, 1.0)`: times `fit` and `compress`,
+/// then scores the output (reconstruction error, recall@10, compressed/original size ratio).
+fn run_benchmark(
+    n_samples: usize,
+    n_dims: usize,
+    m: usize,
+    ks: u32,
+    iterations: usize,
+) -> Result<BenchmarkResult> {
+    let original_data = Array2::<f32>::random((n_samples, n_dims), Uniform::new(0.0, 1.0));
+    let mut pq = PQ::try_new(m, ks)?;
+    let fit_start = Instant::now();
+    pq.fit(&original_data, iterations)?;
+    let fit_time_ms = fit_start.elapsed().as_secs_f64() * 1000.0; // seconds -> milliseconds
+
+    let compress_start = Instant::now();
+    let compressed_data = pq.compress(&original_data)?;
+    let compression_time_ms = compress_start.elapsed().as_secs_f64() * 1000.0;
+
+    let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data);
+    let recall = calculate_recall(&original_data, &compressed_data, 10)?;
+
+    // Fully qualified: bare `size_of` is in the prelude only since Rust 1.80.
+    let original_size = n_samples * n_dims * std::mem::size_of::<f32>();
+    let compressed_size = n_samples * m; // 1 byte per subspace code; assumes ks <= 256 (confirm)
+    let memory_reduction_ratio = compressed_size as f32 / original_size as f32;
+    Ok(BenchmarkResult {
+        n_samples,
+        n_dims,
+        fit_time_ms,
+        compression_time_ms,
+        reconstruction_error,
+        recall,
+        memory_reduction_ratio,
+    })
+}
+
 fn euclidean_distance(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
     a.iter()
         .zip(b.iter())
@@ -30,12 +81,30 @@ fn calculate_reconstruction_error(original: &Array2<f32>, reconstructed: &Array2
 
 fn calculate_recall(original: &Array2<f32>, compressed: &Array2<f32>, k: usize) -> Result<f32> {
     let n_samples = original.len_of(Axis(0));
+
+    // Using every row as a query gets expensive, so only a subset is evaluated: roughly
+    // `max_eval_samples` rows taken at a regular stride. The stride is clamped to at
+    // least 1 so an empty dataset can neither divide by zero nor trip the `step_by(0)`
+    // panic; with no rows the loop below simply does not run.
+    let max_eval_samples = 1000;
+    let eval_samples = n_samples.min(max_eval_samples);
+
     let mut total_recall = 0.0;
+    let step = (n_samples / eval_samples.max(1)).max(1);
 
-    for i in 0..n_samples {
+    for i in (0..n_samples).step_by(step) {
         let query = original.slice(s![i, ..]);
 
-        let mut true_neighbors: Vec<(usize, f32)> = (0..n_samples)
+        // Candidates are limited to rows within half a window on either side of the query.
+        // NOTE(review): for n_samples <= 10000 the window equals n_samples, so a query
+        // still only sees rows within n_samples / 2 of its index, not the whole dataset.
+        // Confirm that this is intended.
+        let search_window = if n_samples > 10000 { 5000 } else { n_samples };
+        let half_window = search_window / 2;
+        let start_idx = i.saturating_sub(half_window);
+        let end_idx = (i + half_window).min(n_samples);
+
+        let mut true_neighbors: Vec<(usize, f32)> = (start_idx..end_idx)
             .filter(|&j| j != i)
             .map(|j| (j, euclidean_distance(&query, &original.slice(s![j, ..]))))
             .collect();
@@ -43,7 +112,7 @@ fn calculate_recall(original: &Array2<f32>, compressed: &Array2<f32>, k: usize)
         let true_neighbors: Vec<usize> =
             true_neighbors.iter().take(k).map(|&(idx, _)| idx).collect();
 
-        let mut approx_neighbors: Vec<(usize, f32)> = (0..n_samples)
+        let mut approx_neighbors: Vec<(usize, f32)> = (start_idx..end_idx)
             .filter(|&j| j != i)
             .map(|j| {
                 (
@@ -67,35 +136,54 @@ fn calculate_recall(original: &Array2<f32>, compressed: &Array2<f32>, k: usize)
         total_recall += intersection / k as f32;
     }
 
-    Ok(total_recall / n_samples as f32)
+    Ok(total_recall / n_samples.div_ceil(step) as f32) // mean over the ceil(n / step) queries run
 }
 
 fn main() -> Result<()> {
     env_logger::init();
 
-    let n_samples = 1000;
+    let sample_sizes = vec![1000, 5000, 10000, 50000, 100000]; // dataset sizes to benchmark
     let n_dims = 128;
-    let original_data = Array2::<f32>::random((n_samples, n_dims), Uniform::new(0.0, 1.0));
-
     let m = 16;
     let ks = 256;
     let iterations = 10;
 
-    let mut pq = PQ::try_new(m, ks)?;
-
-    let fit_start = Instant::now();
-    pq.fit(&original_data, iterations)?;
-    println!("Fit completed in {:?}", fit_start.elapsed());
+    let mut results = Vec::new(); // one `BenchmarkResult` per dataset size, in run order
 
-    let encode_start = Instant::now();
-    let compressed_data = pq.compress(&original_data)?;
-    println!("Compression completed in {:?}", encode_start.elapsed());
+    for n_samples in sample_sizes {
+        info!("Running benchmark with {} samples...", n_samples);
+        let result = run_benchmark(n_samples, n_dims, m, ks, iterations)?; // first error aborts
+        results.push(result);
+    }
 
-    let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data);
-    println!("Reconstruction Error: {:.4}", reconstruction_error);
+    let mut file = File::create("benchmark_results.csv")?; // relative to the working directory
+    writeln!(file, "n_samples,n_dims,fit_time_ms,compression_time_ms,reconstruction_error,recall,memory_reduction_ratio")?;
+    // One CSV row per benchmark, with fields in the same order as the header line.
+    for result in &results {
+        writeln!(
+            file,
+            "{},{},{},{},{},{},{}",
+            result.n_samples,
+            result.n_dims,
+            result.fit_time_ms,
+            result.compression_time_ms,
+            result.reconstruction_error,
+            result.recall,
+            result.memory_reduction_ratio
+        )?;
+    }
 
-    let recall = calculate_recall(&original_data, &compressed_data, 10)?;
-    println!("Recall@10: {:.4}", recall);
+    for result in &results {
+        info!("\nResults for {} samples:", result.n_samples); // logged at info level, not printed
+        info!("Fit time: {:.2}ms", result.fit_time_ms);
+        info!("Compression time: {:.2}ms", result.compression_time_ms);
+        info!("Reconstruction Error: {:.4}", result.reconstruction_error);
+        info!("Recall@10: {:.4}", result.recall);
+        info!(
+            "Memory reduction: {:.2}%",
+            (1.0 - result.memory_reduction_ratio) * 100.0 // ratio is compressed/original
+        );
+    }
 
     Ok(())
 }