Skip to content

Commit

Permalink
feat: added dataset extension traits for working with metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
nishaq503 committed Nov 30, 2024
1 parent e331d29 commit 422f1ff
Show file tree
Hide file tree
Showing 17 changed files with 175 additions and 90 deletions.
3 changes: 2 additions & 1 deletion crates/abd-clam/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ CLAM is a library crate so you can add it to your crate using `cargo add abd_cla
use abd_clam::{
cakes::{Algorithm, Searchable},
cluster::Partition,
dataset::AssociatesMetadataMut,
Ball, Cluster, Dataset, FlatVec,
};
use rand::prelude::*;
Expand Down Expand Up @@ -95,7 +96,7 @@ We can then perform compressed search on the compressed dataset without having t
use abd_clam::{
cakes::{Algorithm, ParSearchable},
cluster::{adapter::ParBallAdapter, ClusterIO, ParPartition},
dataset::DatasetIO,
dataset::{AssociatesMetadataMut, DatasetIO},
metric::Levenshtein,
pancakes::{CodecData, Sequence, SquishyBall},
Ball, Cluster, FlatVec,
Expand Down
1 change: 1 addition & 0 deletions crates/abd-clam/benches/genomic_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use abd_clam::{
cakes::PermutedBall,
cluster::{adapter::ParBallAdapter, BalancedBall, ParPartition},
dataset::AssociatesMetadataMut,
metric::Levenshtein,
pancakes::{Sequence, SquishyBall},
Ball, Cluster, FlatVec,
Expand Down
2 changes: 1 addition & 1 deletion crates/abd-clam/src/cakes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub mod tests {
adapter::{BallAdapter, ParBallAdapter},
ParPartition, Partition,
},
dataset::ParDataset,
dataset::{AssociatesMetadataMut, ParDataset},
metric::{Euclidean, Levenshtein, ParMetric},
Ball, Cluster, Dataset, FlatVec,
};
Expand Down
46 changes: 46 additions & 0 deletions crates/abd-clam/src/core/dataset/associates_metadata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
//! An extension of the `Dataset` trait that provides methods for working with
//! metadata associated with items in a dataset.
use super::Dataset;

/// A trait that extends the `Dataset` trait with methods for working with
/// metadata associated with items in a dataset.
///
/// Each item in the dataset should be associated with a piece of metadata.
///
/// # Type parameters
///
/// - `I`: The type of the items in the dataset.
/// - `Me`: The type of the metadata associated with each item in the dataset.
pub trait AssociatesMetadata<I, Me>: Dataset<I> {
/// Returns all of the metadata associated with the items in the dataset, as
/// a borrowed slice.
fn metadata(&self) -> &[Me];

/// Returns a reference to the metadata associated with the item at the
/// given index.
///
/// Implementations may panic if `index` is out of bounds; the
/// implementations for `FlatVec` and `CodecData` index directly into their
/// metadata storage.
fn metadata_at(&self, index: usize) -> &Me;
}

/// An extension of the `AssociatesMetadata` trait that provides methods for
/// changing the metadata associated with items in a dataset.
///
/// # Type parameters
///
/// - `I`: The type of the items in the dataset.
/// - `Me`: The type of the metadata currently associated with each item.
/// - `Met`: The type of the metadata accepted by `with_metadata` and produced
///   by the closure given to `transform_metadata`.
/// - `D`: The type of dataset returned by `with_metadata` and
///   `transform_metadata`; its metadata have type `Met`.
//
// NOTE(review): `Met` and `D` do not appear in the signatures of
// `metadata_mut` or `metadata_at_mut`, so callers of those two methods may
// need explicit type annotations when a type implements this trait for more
// than one `Met` -- confirm against call sites.
//
// The `allow` silences clippy's complaint that the trait name repeats the name
// of the `associates_metadata` module.
#[allow(clippy::module_name_repetitions)]
pub trait AssociatesMetadataMut<I, Me, Met: Clone, D: AssociatesMetadata<I, Met>>: AssociatesMetadata<I, Me> {
/// Returns all of the metadata associated with the items in the dataset,
/// mutably.
fn metadata_mut(&mut self) -> &mut [Me];

/// Returns the metadata associated with the item at the given index,
/// mutably.
fn metadata_at_mut(&mut self, index: usize) -> &mut Me;

/// Changes all metadata associated with the items in the dataset.
///
/// Consumes the dataset and, on success, returns a dataset of type `D`.
///
/// # Errors
///
/// - If the number of metadata items is not equal to the cardinality of the
/// dataset.
fn with_metadata(self, metadata: &[Met]) -> Result<D, String>;

/// Applies a transformation to the metadata associated with the items in
/// the dataset.
///
/// Consumes the dataset and returns a dataset of type `D`; `f` receives a
/// reference to a `Me` and returns a `Met`.
fn transform_metadata<F: Fn(&Me) -> Met>(self, f: F) -> D;
}
94 changes: 53 additions & 41 deletions crates/abd-clam/src/core/dataset/flat_vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use serde::{Deserialize, Serialize};

use super::{Dataset, ParDataset, Permutable};
use super::{AssociatesMetadata, AssociatesMetadataMut, Dataset, ParDataset, Permutable};

/// A `FlatVec` is a `Dataset` in which the items are stored in a vector.
///
Expand Down Expand Up @@ -100,45 +100,6 @@ impl<I, Me> FlatVec<I, Me> {
self
}

/// Borrows, as a single slice, the metadata stored for the items.
#[must_use]
pub fn metadata(&self) -> &[Me] {
    self.metadata.as_slice()
}

/// Replaces the metadata of the dataset, possibly changing its type.
///
/// # Parameters
///
/// - `metadata`: One piece of metadata per item; the slice is cloned.
///
/// # Returns
///
/// A `FlatVec` holding the same items, dimensionality hint, permutation and
/// name, with the new metadata attached.
///
/// # Errors
///
/// * If the metadata length does not match the number of items.
pub fn with_metadata<Mn: Clone>(self, metadata: &[Mn]) -> Result<FlatVec<I, Mn>, String> {
    // Reject mismatched lengths up front so the happy path reads linearly.
    if metadata.len() != self.items.len() {
        return Err(format!(
            "The metadata length does not match the number of items. {} vs {}",
            metadata.len(),
            self.items.len()
        ));
    }

    // The cloned metadata are reordered with the dataset's permutation
    // (presumably so they line up with the current order of the items).
    let mut reordered = metadata.to_vec();
    reordered.permute(&self.permutation);

    Ok(FlatVec {
        items: self.items,
        dimensionality_hint: self.dimensionality_hint,
        permutation: self.permutation,
        metadata: reordered,
        name: self.name,
    })
}

/// Changes the permutation in the dataset without reordering the items.
#[must_use]
pub fn with_permutation(mut self, permutation: &[usize]) -> Self {
Expand Down Expand Up @@ -195,6 +156,57 @@ impl<I, Me> Dataset<I> for FlatVec<I, Me> {

impl<I: Send + Sync, Me: Send + Sync> ParDataset<I> for FlatVec<I, Me> {}

/// Read-only access to the metadata that a `FlatVec` stores for its items.
impl<I, Me> AssociatesMetadata<I, Me> for FlatVec<I, Me> {
    /// Borrows the metadata of every item as a single slice.
    fn metadata(&self) -> &[Me] {
        self.metadata.as_slice()
    }

    /// Borrows the metadata stored at position `index`.
    ///
    /// # Panics
    ///
    /// If `index` is out of bounds for the stored metadata.
    fn metadata_at(&self, index: usize) -> &Me {
        let all: &[Me] = &self.metadata;
        &all[index]
    }
}

/// Mutable access to, and replacement of, the metadata stored in a `FlatVec`.
///
/// `Met` is the metadata type of the `FlatVec` produced by `with_metadata` and
/// `transform_metadata`.
impl<I, Me, Met: Clone> AssociatesMetadataMut<I, Me, Met, FlatVec<I, Met>> for FlatVec<I, Me> {
    /// Mutably borrows the metadata of every item as a single slice.
    fn metadata_mut(&mut self) -> &mut [Me] {
        self.metadata.as_mut_slice()
    }

    /// Mutably borrows the metadata stored at position `index`.
    ///
    /// # Panics
    ///
    /// If `index` is out of bounds for the stored metadata.
    fn metadata_at_mut(&mut self, index: usize) -> &mut Me {
        let all: &mut [Me] = &mut self.metadata;
        &mut all[index]
    }

    /// Replaces the metadata with a clone of `metadata`, keeping the items,
    /// dimensionality hint, permutation and name.
    ///
    /// # Errors
    ///
    /// If `metadata` does not contain exactly one entry per item.
    fn with_metadata(self, metadata: &[Met]) -> Result<FlatVec<I, Met>, String> {
        // Reject mismatched lengths up front so the happy path reads linearly.
        if metadata.len() != self.items.len() {
            return Err(format!(
                "The metadata length does not match the number of items. {} vs {}",
                metadata.len(),
                self.items.len()
            ));
        }

        // The cloned metadata are reordered with the dataset's permutation
        // (presumably so they line up with the current order of the items).
        let mut reordered = metadata.to_vec();
        reordered.permute(&self.permutation);

        Ok(FlatVec {
            items: self.items,
            dimensionality_hint: self.dimensionality_hint,
            permutation: self.permutation,
            metadata: reordered,
            name: self.name,
        })
    }

    /// Builds a new `FlatVec` whose metadata are the results of calling `f` on
    /// each existing piece of metadata, in stored order.
    ///
    /// No permutation is applied here; the existing metadata are mapped in
    /// place, one-to-one.
    fn transform_metadata<F: Fn(&Me) -> Met>(self, f: F) -> FlatVec<I, Met> {
        let transformed: Vec<Met> = self.metadata.iter().map(f).collect();

        FlatVec {
            items: self.items,
            dimensionality_hint: self.dimensionality_hint,
            permutation: self.permutation,
            metadata: transformed,
            name: self.name,
        }
    }
}

impl<I, Me> Permutable for FlatVec<I, Me> {
fn permutation(&self) -> Vec<usize> {
self.permutation.clone()
Expand Down Expand Up @@ -342,7 +354,7 @@ mod tests {
use rayon::prelude::*;

use crate::{
dataset::{ParDataset, Permutable},
dataset::{AssociatesMetadata, ParDataset, Permutable},
metric::Manhattan,
Dataset,
};
Expand Down
2 changes: 2 additions & 0 deletions crates/abd-clam/src/core/dataset/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ use rayon::prelude::*;

use super::{metric::ParMetric, Metric};

mod associates_metadata;
mod flat_vec;
mod io;
mod permutable;
mod sized_heap;

pub use associates_metadata::{AssociatesMetadata, AssociatesMetadataMut};
pub use flat_vec::FlatVec;
pub use permutable::Permutable;
pub use sized_heap::SizedHeap;
Expand Down
95 changes: 56 additions & 39 deletions crates/abd-clam/src/pancakes/dataset/codec_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize};

use crate::{
cluster::ParCluster,
dataset::{ParDataset, Permutable},
dataset::{AssociatesMetadata, AssociatesMetadataMut, ParDataset, Permutable},
metric::ParMetric,
Cluster, Dataset, FlatVec, Metric, SizedHeap,
};
Expand Down Expand Up @@ -118,44 +118,6 @@ impl<I: Encodable + Clone + Send + Sync> CodecData<I, usize> {
}

impl<I, Me> CodecData<I, Me> {
/// Replaces the metadata associated with the items in the dataset, possibly
/// changing the metadata type.
///
/// # Type Parameters
///
/// - `Met`: The type of the new metadata.
///
/// # Errors
///
/// If the length of `metadata` does not match the cardinality of the
/// dataset.
pub fn with_metadata<Met: Clone>(self, metadata: &[Met]) -> Result<CodecData<I, Met>, String> {
    // Reject mismatched lengths up front so the happy path reads linearly.
    if metadata.len() != self.cardinality {
        return Err(format!(
            "The length of the metadata vector ({}) does not match the cardinality of the dataset ({}).",
            metadata.len(),
            self.cardinality
        ));
    }

    // The cloned metadata are reordered with the dataset's permutation
    // (presumably so they line up with the current order of the items).
    let mut reordered = metadata.to_vec();
    reordered.permute(&self.permutation);

    Ok(CodecData {
        cardinality: self.cardinality,
        dimensionality_hint: self.dimensionality_hint,
        metadata: reordered,
        permutation: self.permutation,
        name: self.name,
        center_map: self.center_map,
        leaf_bytes: self.leaf_bytes,
    })
}

/// Borrows, as a single slice, the metadata stored for the items in the
/// dataset.
#[must_use]
pub fn metadata(&self) -> &[Me] {
    self.metadata.as_slice()
}

/// Changes the permutation of the dataset without changing the order of the
/// items.
#[must_use]
Expand Down Expand Up @@ -324,6 +286,61 @@ impl<I: Decodable + Send + Sync, Me: Send + Sync> ParDataset<I> for CodecData<I,
}
}

/// Read-only access to the metadata that a `CodecData` stores for its items.
impl<I: Decodable, Me> AssociatesMetadata<I, Me> for CodecData<I, Me> {
    /// Borrows the metadata of every item as a single slice.
    fn metadata(&self) -> &[Me] {
        self.metadata.as_slice()
    }

    /// Borrows the metadata stored at position `index`.
    ///
    /// # Panics
    ///
    /// If `index` is out of bounds for the stored metadata.
    fn metadata_at(&self, index: usize) -> &Me {
        let all: &[Me] = &self.metadata;
        &all[index]
    }
}

/// Mutable access to, and replacement of, the metadata stored in a
/// `CodecData`.
///
/// `Met` is the metadata type of the `CodecData` produced by `with_metadata`
/// and `transform_metadata`.
impl<I: Decodable, Me, Met: Clone> AssociatesMetadataMut<I, Me, Met, CodecData<I, Met>> for CodecData<I, Me> {
    /// Mutably borrows the metadata of every item as a single slice.
    fn metadata_mut(&mut self) -> &mut [Me] {
        self.metadata.as_mut_slice()
    }

    /// Mutably borrows the metadata stored at position `index`.
    ///
    /// # Panics
    ///
    /// If `index` is out of bounds for the stored metadata.
    fn metadata_at_mut(&mut self, index: usize) -> &mut Me {
        let all: &mut [Me] = &mut self.metadata;
        &mut all[index]
    }

    /// Replaces the metadata with a clone of `metadata`, keeping every other
    /// field of the dataset unchanged.
    ///
    /// # Errors
    ///
    /// If the length of `metadata` differs from the cardinality of the
    /// dataset.
    fn with_metadata(self, metadata: &[Met]) -> Result<CodecData<I, Met>, String> {
        // Reject mismatched lengths up front so the happy path reads linearly.
        if metadata.len() != self.cardinality {
            return Err(format!(
                "The length of the metadata vector ({}) does not match the cardinality of the dataset ({}).",
                metadata.len(),
                self.cardinality
            ));
        }

        // The cloned metadata are reordered with the dataset's permutation
        // (presumably so they line up with the current order of the items).
        let mut reordered = metadata.to_vec();
        reordered.permute(&self.permutation);

        Ok(CodecData {
            cardinality: self.cardinality,
            dimensionality_hint: self.dimensionality_hint,
            metadata: reordered,
            permutation: self.permutation,
            name: self.name,
            center_map: self.center_map,
            leaf_bytes: self.leaf_bytes,
        })
    }

    /// Builds a new `CodecData` whose metadata are the results of calling `f`
    /// on each existing piece of metadata, in stored order.
    ///
    /// No permutation is applied here; the existing metadata are mapped in
    /// place, one-to-one.
    fn transform_metadata<F: Fn(&Me) -> Met>(self, f: F) -> CodecData<I, Met> {
        let transformed: Vec<Met> = self.metadata.iter().map(f).collect();

        CodecData {
            cardinality: self.cardinality,
            dimensionality_hint: self.dimensionality_hint,
            metadata: transformed,
            permutation: self.permutation,
            name: self.name,
            center_map: self.center_map,
            leaf_bytes: self.leaf_bytes,
        }
    }
}

#[cfg(feature = "bitcode")]
/// Encodes using bitcode and compresses using Gzip.
///
Expand Down
1 change: 1 addition & 0 deletions crates/abd-clam/src/pancakes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ pub mod tests {
adapter::{BallAdapter, ParBallAdapter},
ParPartition, Partition,
},
dataset::{AssociatesMetadata, AssociatesMetadataMut},
metric::{AbsoluteDifference, Levenshtein},
pancakes::{CodecData, SquishyBall},
Ball, Cluster, Dataset, FlatVec,
Expand Down
2 changes: 1 addition & 1 deletion crates/results/cakes/src/data/raw/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Reading data from various sources.
use abd_clam::FlatVec;
use abd_clam::{dataset::AssociatesMetadataMut, FlatVec};

use super::tree::instances::{Aligned, MemberSet, Unaligned};

Expand Down
2 changes: 1 addition & 1 deletion crates/results/cakes/src/data/tree/aligned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::collections::HashMap;
use abd_clam::{
cakes::Algorithm,
cluster::{adapter::ParBallAdapter, ClusterIO, Csv, ParPartition},
dataset::ParDatasetIO,
dataset::{AssociatesMetadata, AssociatesMetadataMut, ParDatasetIO},
pancakes::{CodecData, SquishyBall},
Ball, Cluster, Dataset, FlatVec,
};
Expand Down
2 changes: 1 addition & 1 deletion crates/results/cakes/src/data/tree/ann_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::collections::HashMap;
use abd_clam::{
cakes::Algorithm,
cluster::{adapter::ParBallAdapter, ClusterIO, Csv, ParPartition},
dataset::ParDatasetIO,
dataset::{AssociatesMetadata, AssociatesMetadataMut, ParDatasetIO},
pancakes::{CodecData, SquishyBall},
Ball, Cluster, FlatVec,
};
Expand Down
2 changes: 1 addition & 1 deletion crates/results/cakes/src/data/tree/unaligned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::collections::HashMap;
use abd_clam::{
cakes::Algorithm,
cluster::{adapter::ParBallAdapter, ClusterIO, Csv, ParPartition},
dataset::ParDatasetIO,
dataset::{AssociatesMetadata, AssociatesMetadataMut, ParDatasetIO},
metric::Levenshtein,
pancakes::{CodecData, SquishyBall},
Ball, Cluster, Dataset, FlatVec,
Expand Down
2 changes: 1 addition & 1 deletion crates/results/chaoda/src/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use std::path::Path;

use abd_clam::{Dataset, FlatVec};
use abd_clam::{dataset::AssociatesMetadataMut, Dataset, FlatVec};
use ndarray::prelude::*;
use ndarray_npy::ReadNpyExt;

Expand Down
1 change: 1 addition & 0 deletions crates/results/chaoda/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use clap::Parser;

use abd_clam::{
chaoda::{GraphAlgorithm, TrainableMetaMlModel, TrainableSmc},
dataset::AssociatesMetadata,
metric::{Euclidean, Manhattan},
Ball, Cluster, Dataset,
};
Expand Down
Loading

0 comments on commit 422f1ff

Please sign in to comment.