Skip to content

Commit

Permalink
wip: added random dataset benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
nishaq503 committed Jan 4, 2025
1 parent 2df7100 commit fcd8e1a
Show file tree
Hide file tree
Showing 4 changed files with 262 additions and 118 deletions.
335 changes: 221 additions & 114 deletions benches/cakes/src/data_gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,223 @@ pub fn read_tabular_and_augment<P: AsRef<std::path::Path>, M: ParMetric<Vec<f32>
inp_dir: &P,
out_dir: &P,
) -> Result<Vec<Vec<f32>>, String> {
let ([data_orig_path, queries_path, neighbors_path, distances_path], all_paths) =
gen_all_paths(dataset, max_power, out_dir);

if all_paths.iter().all(|p| p.exists()) {
ftlog::info!("Augmented datasets already exist. Reading queries from {queries_path:?}...");
return FlatVec::<Vec<f32>, usize>::read_npy(&queries_path).map(FlatVec::take_items);
}

ftlog::info!("Reading data from {:?}...", inp_dir.as_ref());
let data = dataset.read::<_, f32>(&inp_dir)?;
let (train, queries, neighbors) = (data.train, data.queries, data.neighbors);
let (neighbors, distances): (Vec<_>, Vec<_>) = neighbors
.into_iter()
.map(|n| {
let (n, d): (Vec<_>, Vec<_>) = n.into_iter().unzip();
let n = n.into_iter().map(Number::as_u64).collect::<Vec<_>>();
(n, d)
})
.unzip();
let k = neighbors[0].len();
let neighbors = FlatVec::new(neighbors)?.with_dim_lower_bound(k).with_dim_upper_bound(k);
let distances = FlatVec::new(distances)?.with_dim_lower_bound(k).with_dim_upper_bound(k);

let (min_dim, max_dim) = train
.iter()
.chain(queries.iter())
.fold((usize::MAX, 0), |(min, max), x| (min.min(x.len()), max.max(x.len())));

let data = FlatVec::new(train)?
.with_name(dataset.name())
.with_dim_lower_bound(min_dim)
.with_dim_upper_bound(max_dim);

ftlog::info!("Writing original data as npy array to {data_orig_path:?}...");
data.write_npy(&data_orig_path)?;

let query_data = FlatVec::new(queries)?
.with_name(&format!("{}-queries", dataset.name()))
.with_dim_lower_bound(min_dim)
.with_dim_upper_bound(max_dim);
ftlog::info!("Writing queries as npy array to {queries_path:?}...");
query_data.write_npy(&queries_path)?;
let queries = query_data.take_items();

ftlog::info!("Writing neighbors to {neighbors_path:?}...");
neighbors.write_npy(&neighbors_path)?;
distances.write_npy(&distances_path)?;

ftlog::info!("Augmenting data...");
let train = data.take_items();
let base_cardinality = train.len();
let data = AnnDataset {
train,
queries: Vec::new(),
neighbors: Vec::new(),
}
.augment(1 << max_power, 0.1);

// The value of k is hardcoded to 100 to find the true neighbors of the
// augmented datasets.
let k = 100;

let mut data = FlatVec::new(data.train)?
.with_dim_lower_bound(min_dim)
.with_dim_upper_bound(max_dim);

for power in (1..=max_power).rev() {
let name = format!("{}-{power}", dataset.name());
let data_path = out_dir.as_ref().join(format!("{name}.npy"));
let neighbors_path = out_dir.as_ref().join(format!("{name}-neighbors.npy"));
let distances_path = out_dir.as_ref().join(format!("{name}-distances.npy"));

let size = base_cardinality * (1 << power);
let mut rng = rand::rngs::StdRng::seed_from_u64(seed.unwrap_or(42));
data = data.random_subsample(&mut rng, size).with_name(&name);
ftlog::info!("Writing {}x augmented data to {data_path:?}...", 1 << power);
data.write_npy(&data_path)?;

ftlog::info!("Finding true neighbors for {name}...");
let indices = (0..data.cardinality()).collect::<Vec<_>>();
let true_hits = queries
.par_iter()
.map(|query| {
let mut hits = data.par_query_to_many(query, &indices, metric).collect::<Vec<_>>();
hits.sort_by(|(_, a), (_, b)| a.total_cmp(b));
let _ = hits.split_off(k);
hits
})
.collect::<Vec<_>>();
ftlog::info!("Writing true neighbors to {neighbors_path:?} and distances to {distances_path:?}...");
let (neighbors, distances): (Vec<_>, Vec<_>) = true_hits
.into_iter()
.map(|mut nds| {
// Sort the neighbors by distance from the query.
nds.sort_by(|(_, a), (_, b)| a.total_cmp(b));

let (n, d): (Vec<_>, Vec<_>) = nds.into_iter().unzip();
let n = n.into_iter().map(Number::as_u64).collect::<Vec<_>>();

(n, d)
})
.unzip();
FlatVec::new(neighbors)?
.with_dim_lower_bound(k)
.with_dim_upper_bound(k)
.write_npy(&neighbors_path)?;
FlatVec::new(distances)?
.with_dim_lower_bound(k)
.with_dim_upper_bound(k)
.write_npy(&distances_path)?;
}

Ok(queries)
}

/// Read or generate random datasets and ground-truth search results.
///
/// If all expected output files already exist in `out_dir`, the queries are
/// read back from disk and returned immediately. Otherwise, a random dataset
/// is generated, written out at each power-of-2 subsample up to `max_power`,
/// and the true nearest neighbors of the queries are found by exhaustive
/// search and written alongside each subsample.
///
/// # Arguments
///
/// - `metric`: The metric to use for linear search to find true neighbors of
///   the random datasets.
/// - `max_power`: The maximum power of 2 to which the cardinality of the
///   dataset should be augmented.
/// - `seed`: The seed for the random number generator.
/// - `out_dir`: The directory to which the random datasets and ground-truth
///   neighbors and distances should be written.
///
/// # Returns
///
/// The queries to use for benchmarking.
///
/// # Errors
///
/// - If any dataset cannot be written to or read from `out_dir`.
pub fn read_or_gen_random<P: AsRef<std::path::Path>, M: ParMetric<Vec<f32>, f32>>(
    metric: &M,
    max_power: u32,
    seed: Option<u64>,
    out_dir: &P,
) -> Result<Vec<Vec<f32>>, String> {
    let dataset = bench_utils::ann_benchmarks::RawData::Random;
    let ([_, queries_path, _, _], all_paths) = gen_all_paths(&dataset, max_power, out_dir);

    if all_paths.iter().all(|p| p.exists()) {
        ftlog::info!("Random datasets already exist. Reading queries from {queries_path:?}...");
        return FlatVec::<Vec<f32>, usize>::read_npy(&queries_path).map(FlatVec::take_items);
    }

    // The value of k is hardcoded to 100 to find the true neighbors of the
    // random datasets.
    let k = 100;
    let n_queries = 100;
    let base_cardinality = 1_000_000;
    let dimensionality = 128;

    // BUGFIX: the generator previously hardcoded its seed to 42, silently
    // ignoring the `seed` argument that the subsampling below honors. Use the
    // same `seed.unwrap_or(42)` fallback everywhere for consistency.
    let data = AnnDataset::gen_random(
        base_cardinality,
        1 << max_power,
        dimensionality,
        n_queries,
        seed.unwrap_or(42),
    );
    let (train, queries, _) = (data.train, data.queries, data.neighbors);

    let queries = FlatVec::new(queries)?
        .with_dim_lower_bound(dimensionality)
        .with_dim_upper_bound(dimensionality);
    // Reuse the path computed by `gen_all_paths` rather than rebuilding it.
    ftlog::info!("Writing queries as npy array to {queries_path:?}...");
    queries.write_npy(&queries_path)?;
    let queries = queries.take_items();

    let mut data = FlatVec::new(train)?
        .with_dim_lower_bound(dimensionality)
        .with_dim_upper_bound(dimensionality);
    for power in (0..=max_power).rev() {
        let name = format!("{}-{}", dataset.name(), power);
        let data_path = out_dir.as_ref().join(format!("{name}.npy"));
        let neighbors_path = out_dir.as_ref().join(format!("{name}-neighbors.npy"));
        let distances_path = out_dir.as_ref().join(format!("{name}-distances.npy"));

        // Subsample down to `base_cardinality * 2^power` points. The RNG is
        // re-seeded each iteration so every subsample is reproducible on its
        // own, independent of the order in which sizes are generated.
        let size = base_cardinality * (1 << power);
        let mut rng = rand::rngs::StdRng::seed_from_u64(seed.unwrap_or(42));
        data = data.random_subsample(&mut rng, size).with_name(&name);
        ftlog::info!("Writing {}x random data to {data_path:?}...", 1 << power);
        data.write_npy(&data_path)?;

        // Exhaustive (linear) search for the true k nearest neighbors of each
        // query within the current subsample.
        ftlog::info!("Finding true neighbors for {name}...");
        let indices = (0..data.cardinality()).collect::<Vec<_>>();
        let true_hits = queries
            .par_iter()
            .map(|query| {
                let mut hits = data.par_query_to_many(query, &indices, metric).collect::<Vec<_>>();
                hits.sort_by(|(_, a), (_, b)| a.total_cmp(b));
                // `truncate` keeps the k nearest and, unlike `split_off`, does
                // not panic if fewer than k hits were found.
                hits.truncate(k);
                hits
            })
            .collect::<Vec<_>>();

        ftlog::info!("Writing true neighbors to {neighbors_path:?} and distances to {distances_path:?}...");
        // Split the (index, distance) pairs into parallel arrays. The hits are
        // already sorted by distance above, so no re-sort is needed here.
        let (neighbors, distances): (Vec<_>, Vec<_>) = true_hits
            .into_iter()
            .map(|nds| {
                let (n, d): (Vec<_>, Vec<_>) = nds.into_iter().unzip();
                let n = n.into_iter().map(Number::as_u64).collect::<Vec<_>>();
                (n, d)
            })
            .unzip();
        FlatVec::new(neighbors)?
            .with_dim_lower_bound(k)
            .with_dim_upper_bound(k)
            .write_npy(&neighbors_path)?;
        FlatVec::new(distances)?
            .with_dim_lower_bound(k)
            .with_dim_upper_bound(k)
            .write_npy(&distances_path)?;
    }

    Ok(queries)
}

/// Generate all paths for the augmented datasets and ground-truth neighbors and
/// distances.
fn gen_all_paths<P: AsRef<std::path::Path>>(
dataset: &bench_utils::ann_benchmarks::RawData,
max_power: u32,
out_dir: &P,
) -> ([std::path::PathBuf; 4], Vec<std::path::PathBuf>) {
let data_orig_path = out_dir.as_ref().join(format!("{}-0.npy", dataset.name()));
let queries_path = out_dir.as_ref().join(format!("{}-queries.npy", dataset.name()));
let neighbors_path = out_dir.as_ref().join(format!("{}-0-neighbors.npy", dataset.name()));
Expand All @@ -63,118 +280,8 @@ pub fn read_tabular_and_augment<P: AsRef<std::path::Path>, M: ParMetric<Vec<f32>
paths
};

let queries = if all_paths.iter().all(|p| p.exists()) {
ftlog::info!("Augmented datasets already exist. Reading queries from {queries_path:?}...");
FlatVec::<Vec<f32>, usize>::read_npy(&queries_path)?.take_items()
} else {
ftlog::info!("Reading data from {:?}...", inp_dir.as_ref());

let data = dataset.read::<_, f32>(&inp_dir)?;

let (train, queries, neighbors) = (data.train, data.queries, data.neighbors);
let (neighbors, distances): (Vec<_>, Vec<_>) = neighbors
.into_iter()
.map(|n| {
let (n, d): (Vec<_>, Vec<_>) = n.into_iter().unzip();
let n = n.into_iter().map(Number::as_u64).collect::<Vec<_>>();
(n, d)
})
.unzip();
let k = neighbors[0].len();
let neighbors = FlatVec::new(neighbors)?.with_dim_lower_bound(k).with_dim_upper_bound(k);
let distances = FlatVec::new(distances)?.with_dim_lower_bound(k).with_dim_upper_bound(k);

let (min_dim, max_dim) = train
.iter()
.chain(queries.iter())
.fold((usize::MAX, 0), |(min, max), x| (min.min(x.len()), max.max(x.len())));

let data = FlatVec::new(train)?
.with_name(dataset.name())
.with_dim_lower_bound(min_dim)
.with_dim_upper_bound(max_dim);

ftlog::info!("Writing original data as npy array to {data_orig_path:?}...");
data.write_npy(&data_orig_path)?;

let query_data = FlatVec::new(queries)?
.with_name(&format!("{}-queries", dataset.name()))
.with_dim_lower_bound(min_dim)
.with_dim_upper_bound(max_dim);
ftlog::info!("Writing queries as npy array to {queries_path:?}...");
query_data.write_npy(&queries_path)?;
let queries = query_data.take_items();

ftlog::info!("Writing neighbors to {neighbors_path:?}...");
neighbors.write_npy(&neighbors_path)?;
distances.write_npy(&distances_path)?;

ftlog::info!("Augmenting data...");
let train = data.take_items();
let base_cardinality = train.len();
let data = AnnDataset {
train,
queries: Vec::new(),
neighbors: Vec::new(),
}
.augment(1 << max_power, 0.1);

// The value of k is hardcoded to 100 to find the true neighbors of the
// augmented datasets.
let k = 100;

let mut data = FlatVec::new(data.train)?
.with_dim_lower_bound(min_dim)
.with_dim_upper_bound(max_dim);

for power in (1..=max_power).rev() {
let name = format!("{}-{power}", dataset.name());
let data_path = out_dir.as_ref().join(format!("{name}.npy"));
let neighbors_path = out_dir.as_ref().join(format!("{name}-neighbors.npy"));
let distances_path = out_dir.as_ref().join(format!("{name}-distances.npy"));

let size = base_cardinality * (1 << power);
let mut rng = rand::rngs::StdRng::seed_from_u64(seed.unwrap_or(42));
data = data.random_subsample(&mut rng, size).with_name(&name);
ftlog::info!("Writing {}x augmented data to {data_path:?}...", 1 << power);
data.write_npy(&data_path)?;

ftlog::info!("Finding true neighbors for {name}...");
let indices = (0..data.cardinality()).collect::<Vec<_>>();
let true_hits = queries
.par_iter()
.map(|query| {
let mut hits = data.par_query_to_many(query, &indices, metric).collect::<Vec<_>>();
hits.sort_by(|(_, a), (_, b)| a.total_cmp(b));
let _ = hits.split_off(k);
hits
})
.collect::<Vec<_>>();
ftlog::info!("Writing true neighbors to {neighbors_path:?} and distances to {distances_path:?}...");
let (neighbors, distances): (Vec<_>, Vec<_>) = true_hits
.into_iter()
.map(|mut nds| {
// Sort the neighbors by distance from the query.
nds.sort_by(|(_, a), (_, b)| a.total_cmp(b));

let (n, d): (Vec<_>, Vec<_>) = nds.into_iter().unzip();
let n = n.into_iter().map(Number::as_u64).collect::<Vec<_>>();

(n, d)
})
.unzip();
FlatVec::new(neighbors)?
.with_dim_lower_bound(k)
.with_dim_upper_bound(k)
.write_npy(&neighbors_path)?;
FlatVec::new(distances)?
.with_dim_lower_bound(k)
.with_dim_upper_bound(k)
.write_npy(&distances_path)?;
}

queries
};

Ok(queries)
(
[data_orig_path, queries_path, neighbors_path, distances_path],
all_paths,
)
}
7 changes: 5 additions & 2 deletions benches/cakes/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,11 @@ fn main() -> Result<(), String> {
metric
};

let queries =
data_gen::read_tabular_and_augment(&args.dataset, &metric, max_power, args.seed, &inp_dir, &out_dir)?;
let queries = if matches!(args.dataset, bench_utils::ann_benchmarks::RawData::Random) {
data_gen::read_or_gen_random(&metric, max_power, seed, &out_dir)?
} else {
data_gen::read_tabular_and_augment(&args.dataset, &metric, max_power, args.seed, &inp_dir, &out_dir)?
};
if args.generate_only {
return Ok(());
}
Expand Down
1 change: 1 addition & 0 deletions benches/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ csv = { workspace = true }
stringzilla = "3.9.5"
bio = { workspace = true }
rand = { workspace = true }
rayon = { workspace = true }
Loading

0 comments on commit fcd8e1a

Please sign in to comment.