From 3a64a79168a89b50c523e900bc9feb12a9f4d614 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Fri, 26 Jul 2024 22:02:02 -0400 Subject: [PATCH] feat: updated most of CHAODA --- crates/abd-clam/src/chaoda/cluster/mod.rs | 2 +- crates/abd-clam/src/chaoda/cluster/vertex.rs | 18 +- crates/abd-clam/src/chaoda/component.rs | 303 +++++++++++++++ crates/abd-clam/src/chaoda/graph.rs | 143 ++++++++ crates/abd-clam/src/chaoda/members/cc.rs | 32 ++ crates/abd-clam/src/chaoda/members/gn.rs | 56 +++ crates/abd-clam/src/chaoda/members/mod.rs | 203 ++++++++++ crates/abd-clam/src/chaoda/members/pc.rs | 32 ++ crates/abd-clam/src/chaoda/members/sc.rs | 37 ++ crates/abd-clam/src/chaoda/members/sp.rs | 49 +++ crates/abd-clam/src/chaoda/members/vd.rs | 32 ++ crates/abd-clam/src/chaoda/meta_ml.rs | 187 ++++++++++ crates/abd-clam/src/chaoda/mod.rs | 347 +++++++++++++++++- crates/abd-clam/src/core/cluster/ball.rs | 13 + crates/abd-clam/src/core/cluster/children.rs | 2 +- .../abd-clam/src/core/cluster/index_store.rs | 2 +- crates/abd-clam/src/core/cluster/mod.rs | 34 +- 17 files changed, 1485 insertions(+), 7 deletions(-) create mode 100644 crates/abd-clam/src/chaoda/component.rs create mode 100644 crates/abd-clam/src/chaoda/graph.rs create mode 100644 crates/abd-clam/src/chaoda/members/cc.rs create mode 100644 crates/abd-clam/src/chaoda/members/gn.rs create mode 100644 crates/abd-clam/src/chaoda/members/mod.rs create mode 100644 crates/abd-clam/src/chaoda/members/pc.rs create mode 100644 crates/abd-clam/src/chaoda/members/sc.rs create mode 100644 crates/abd-clam/src/chaoda/members/sp.rs create mode 100644 crates/abd-clam/src/chaoda/members/vd.rs create mode 100644 crates/abd-clam/src/chaoda/meta_ml.rs diff --git a/crates/abd-clam/src/chaoda/cluster/mod.rs b/crates/abd-clam/src/chaoda/cluster/mod.rs index 6ce625970..17518ba16 100644 --- a/crates/abd-clam/src/chaoda/cluster/mod.rs +++ b/crates/abd-clam/src/chaoda/cluster/mod.rs @@ -6,7 +6,7 @@ use distances::Number; use crate::Cluster; -pub 
use vertex::Vertex; +pub use vertex::{Ratios, Vertex}; /// A cluster that is used for anomaly detection. pub trait OddBall: Cluster { diff --git a/crates/abd-clam/src/chaoda/cluster/vertex.rs b/crates/abd-clam/src/chaoda/cluster/vertex.rs index c9db707df..a8d0c7b8b 100644 --- a/crates/abd-clam/src/chaoda/cluster/vertex.rs +++ b/crates/abd-clam/src/chaoda/cluster/vertex.rs @@ -10,7 +10,7 @@ use super::OddBall; pub type Ratios = [f32; 6]; /// A `Vertex` to use as a node in a `Graph`. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Vertex { /// The `Ball` that was adapted into this `Vertex`. ball: Ball, @@ -237,8 +237,22 @@ impl PartialEq for Vertex { } } +impl Eq for Vertex {} + impl PartialOrd for Vertex { fn partial_cmp(&self, other: &Self) -> Option { - self.ball.partial_cmp(&other.ball) + Some(self.cmp(other)) + } +} + +impl Ord for Vertex { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.ball.cmp(&other.ball) + } +} + +impl std::hash::Hash for Vertex { + fn hash(&self, state: &mut H) { + self.ball.hash(state); } } diff --git a/crates/abd-clam/src/chaoda/component.rs b/crates/abd-clam/src/chaoda/component.rs new file mode 100644 index 000000000..05c5b6099 --- /dev/null +++ b/crates/abd-clam/src/chaoda/component.rs @@ -0,0 +1,303 @@ +//! A `Component` is a single connected subgraph of a `Graph`. + +use std::collections::{BTreeMap, BTreeSet}; + +use distances::Number; +use ndarray::prelude::*; + +use crate::Dataset; + +use super::OddBall; + +/// A `Neighbors` is a mapping from a `ClusterKey` to the distance between the `OddBall`s. +pub type Neighbors = BTreeMap; +/// An `AdjacencyList` is a mapping from a `ClusterKey` to its neighbors. +pub type AdjacencyList = BTreeMap>; + +/// A `Component` is a single connected subgraph of a `Graph`. +/// +/// We break the `Graph` into connected `Component`s because this makes several +/// computations significantly easier to think about and implement. 
+#[derive(Clone)] +pub struct Component<'a, U: Number, C: OddBall> { + /// The clusters and their neighbors in the `Component`. + adjacency_list: AdjacencyList<&'a C, U>, + /// The total number of points in the `OddBall`s in the `Component`. + population: usize, + /// Eccentricity of each `OddBall` in the `Component`. + eccentricities: Option>, + /// Diameter of the `Component`. + diameter: Option, + /// neighborhood sizes of each `OddBall` in the `Component` at each step through a BFT. + neighborhood_sizes: Option>>, + /// The accumulated child-parent cardinality ratio of each `OddBall` in the `Component`. + accumulated_cp_car_ratios: BTreeMap<&'a C, f32>, + /// The anomaly properties of the `OddBall`s in the `Component`. + anomaly_properties: BTreeMap<&'a C, Vec>, +} + +impl<'a, U: Number, C: OddBall> Component<'a, U, C> { + /// Create a new `Component` from a collection of `OddBall`s. + pub fn new>(clusters: &[&'a C], data: &D) -> Vec { + let adjacency_list: AdjacencyList<_, _> = clusters + .iter() + .enumerate() + .map(|(i, &c1)| { + let neighbors = clusters + .iter() + .enumerate() + .filter(|&(j, _)| i != j) + .filter_map(|(_, &c2)| { + let (r1, r2) = (c1.radius(), c2.radius()); + let d = c1.distance_to_other(data, c2); + if d <= r1 + r2 { + Some((c2, d)) + } else { + None + } + }) + .collect::>(); + (c1, neighbors) + }) + .collect(); + + let population = clusters.iter().map(|c| c.cardinality()).sum(); + let accumulated_cp_car_ratios = clusters.iter().map(|&c| (c, c.accumulated_cp_car_ratio())).collect(); + let anomaly_properties = clusters.iter().map(|&c| (c, c.ratios())).collect(); + + let c = Self { + adjacency_list, + population, + eccentricities: None, + diameter: None, + neighborhood_sizes: None, + accumulated_cp_car_ratios, + anomaly_properties, + }; + + let [mut c, mut other] = c.partition(); + let mut components = vec![c]; + while !other.is_empty() { + [c, other] = other.partition(); + components.push(c); + } + + components + } + + /// Partition the 
`Component` into two `Component`s. + /// + /// The first component is a connected subgraph of the original `Component` + /// and the second component is the rest of the original `Component`. + /// + /// This method is used when first constructing the `Graph` to find the + /// connected subgraphs of the `Graph`. + /// + /// This method is meant to be used in a loop to find all connected subgraphs + /// of a `Graph`. It resets the internal members of the `Component` that are + /// computed lazily, i.e. the eccentricities, diameter, and neighborhood sizes. + fn partition(mut self) -> [Self; 2] { + // Perform a traversal of the adjacency list to find a connected subgraph. + let mut visited: BTreeSet<&C> = BTreeSet::new(); + let mut stack: Vec<&C> = self.adjacency_list.keys().copied().collect(); + while let Some(k) = stack.pop() { + // Check if the cluster has already been visited. + if visited.contains(&k) { + continue; + } + // Mark the cluster as visited. + visited.insert(k); + // Add the neighbors of the cluster to the stack to be visited. + for &j in self.adjacency_list[&k].keys() { + stack.push(j); + } + } + + // Partition the clusters into visited and unvisited clusters. + let (p1, p2): (AdjacencyList<_, _>, AdjacencyList<_, _>) = + self.adjacency_list.into_iter().partition(|(k, _)| visited.contains(k)); + let p1: AdjacencyList<_, _> = p1 + .into_iter() + .map(|(k, n)| { + let n = n.into_iter().filter(|(j, _)| visited.contains(j)).collect(); + (k, n) + }) + .collect(); + let p2: AdjacencyList<_, _> = p2 + .into_iter() + .map(|(k, n)| { + let n = n.into_iter().filter(|(j, _)| visited.contains(j)).collect(); + (k, n) + }) + .collect(); + + // Build a component from the clusters that were not visited in the traversal. 
+ let population = p2.keys().map(|c| c.cardinality()).sum(); + let accumulated_cp_car_ratios = p2.keys().map(|&r| (r, self.accumulated_cp_car_ratios[&r])).collect(); + let anomaly_properties = p2.keys().map(|&r| (r, self.anomaly_properties[&r].clone())).collect(); + let other = Self { + adjacency_list: p2, + population, + eccentricities: None, + diameter: None, + neighborhood_sizes: None, + accumulated_cp_car_ratios, + anomaly_properties, + }; + + // Set the current component to the visited clusters. + self.adjacency_list = p1; + self.population = self.adjacency_list.keys().map(|c| c.cardinality()).sum(); + self.eccentricities = None; + self.diameter = None; + self.neighborhood_sizes = None; + self.accumulated_cp_car_ratios = self + .adjacency_list + .keys() + .map(|&r| (r, self.accumulated_cp_car_ratios[&r])) + .collect(); + self.anomaly_properties = self + .adjacency_list + .keys() + .map(|&r| (r, self.anomaly_properties[&r].clone())) + .collect(); + + [self, other] + } + + /// Check if the `Component` has any `OddBall`s. + #[must_use] + pub fn is_empty(&self) -> bool { + self.adjacency_list.is_empty() + } + + /// Iterate over the `OddBall`s in the `Component`. + pub fn iter_clusters(&self) -> impl Iterator { + self.adjacency_list.keys().copied() + } + + /// Iterate over the lists of neighbors of the `OddBall`s in the `Component`. + pub fn iter_neighbors(&self) -> impl Iterator> { + self.adjacency_list.values() + } + + /// Iterate over the anomaly properties of the `OddBall`s in the `Component`. + pub fn iter_anomaly_properties(&self) -> impl Iterator> { + self.anomaly_properties.values() + } + + /// Get the number of `OddBall`s in the `Component`. + #[must_use] + pub fn cardinality(&self) -> usize { + self.adjacency_list.len() + } + + /// Get the total number of points in the `Component`. + #[must_use] + pub const fn population(&self) -> usize { + self.population + } + + /// Get the diameter of the `Component`. 
+ pub fn diameter(&mut self) -> usize { + if self.diameter.is_none() { + if self.eccentricities.is_none() { + self.compute_eccentricities(); + } + let ecc = self + .eccentricities + .as_ref() + .unwrap_or_else(|| unreachable!("We just computed the eccentricities")); + self.diameter = Some(ecc.iter().copied().max().unwrap_or(0)); + } + self.diameter + .unwrap_or_else(|| unreachable!("We just computed the diameter")) + } + + /// Compute the eccentricity of each `OddBall` in the `Component`. + pub fn compute_eccentricities(&mut self) { + self.eccentricities = Some(self.neighborhood_sizes().map(Vec::len).collect()); + } + + /// Get the neighborhood sizes of all `OddBall`s in the `Component`. + pub fn neighborhood_sizes(&mut self) -> impl Iterator> + '_ { + if self.neighborhood_sizes.is_none() { + self.neighborhood_sizes = Some( + self.adjacency_list + .iter() + .map(|(&k, _)| (k, self.compute_neighborhood_sizes(k))) + .collect(), + ); + } + self.neighborhood_sizes + .as_ref() + .unwrap_or_else(|| unreachable!("We just computed the neighborhood sizes")) + .values() + } + + /// Get the cumulative number of neighbors encountered after each step through a BFT. + fn compute_neighborhood_sizes(&self, k: &C) -> Vec { + let mut visited: BTreeSet<&C> = BTreeSet::new(); + let mut neighborhood_sizes: Vec = Vec::new(); + let mut stack: Vec<&C> = vec![k]; + + while let Some(i) = stack.pop() { + if visited.contains(&i) { + continue; + } + visited.insert(i); + let new_neighbors = self.adjacency_list[&i] + .iter() + .filter(|(&j, _)| !visited.contains(j)) + .collect::>(); + neighborhood_sizes.push(new_neighbors.len()); + stack.extend(new_neighbors.iter().map(|(j, _)| *j)); + } + + neighborhood_sizes + .iter() + .scan(0, |acc, x| { + *acc += x; + Some(*acc) + }) + .collect() + } + + /// Compute the stationary probability of each `OddBall` in the `Component`. 
+ #[must_use] + pub fn compute_stationary_probabilities(&self, num_steps: usize) -> Vec { + if self.cardinality() == 1 { + // Singleton components need to be marked as anomalous. + return vec![0.0]; + } + + let mut transition_matrix = vec![0_f32; self.cardinality() * self.cardinality()]; + for (i, (_, neighbors)) in self.adjacency_list.iter().enumerate() { + for (j, (_, &d)) in neighbors.iter().enumerate() { + transition_matrix[i * self.cardinality() + j] = d.as_f32().recip(); + } + } + // Convert the transition matrix to an Array2 + let mut transition_matrix = Array2::from_shape_vec((self.cardinality(), self.cardinality()), transition_matrix) + .unwrap_or_else(|e| unreachable!("We created a square Transition matrix: {e}")); + + // Normalize the transition matrix so that each row sums to 1 + for i in 0..self.cardinality() { + let row_sum = transition_matrix.row(i).sum(); + transition_matrix.row_mut(i).mapv_inplace(|x| x / row_sum); + } + + // Compute the stationary probabilities by squaring the transition matrix `num_steps` times + for _ in 0..num_steps { + transition_matrix = transition_matrix.dot(&transition_matrix); + } + + // Compute the stationary probabilities by summing the rows of the transition matrix + transition_matrix.sum_axis(Axis(1)).to_vec() + } + + /// Get the accumulated child-parent cardinality ratio of each `OddBall` in the `Component`. + pub fn accumulated_cp_car_ratios(&self) -> impl Iterator + '_ { + self.accumulated_cp_car_ratios.values().copied() + } +} diff --git a/crates/abd-clam/src/chaoda/graph.rs b/crates/abd-clam/src/chaoda/graph.rs new file mode 100644 index 000000000..e994e8965 --- /dev/null +++ b/crates/abd-clam/src/chaoda/graph.rs @@ -0,0 +1,143 @@ +//! A `Graph` is a collection of `OddBall`s. + +use core::cmp::Reverse; +use std::collections::BinaryHeap; + +use distances::Number; +use ordered_float::OrderedFloat; + +use crate::Dataset; + +use super::{Component, Neighbors, OddBall}; + +/// A `Graph` is a collection of `OddBall`s. 
+/// +/// Two `OddBall`s have an edge between them if they have any overlapping volume, +/// i.e. if the distance between their centers is no greater than the sum of their +/// radii. +#[derive(Clone)] +pub struct Graph<'a, U: Number, C: OddBall> { + /// The collection of `Component`s in the `Graph`. + components: Vec>, + /// Cumulative populations of the `Component`s in the `Graph`. + populations: Vec, +} + +// , C: OddBall, const N: usize +impl<'a, U: Number, C: OddBall> Graph<'a, U, C> { + /// Create a new `Graph` from a `Tree`. + /// + /// # Arguments + /// + /// * `tree`: The `Tree` to create the `Graph` from. + /// * `cluster_scorer`: A function that scores `OddBall`s. + /// * `min_depth`: The minimum depth at which to consider a `OddBall`. + pub fn from_tree>( + root: &'a C, + data: &D, + cluster_scorer: impl Fn(&[&'a C]) -> Vec, + min_depth: usize, + ) -> Self + where + U: 'a, + { + let clusters = root.subtree(); + let scores = cluster_scorer(&clusters); + + // We use `OrderedFloat` to have the `Ord` trait implemented for `f64` so that we can use it in a `BinaryHeap`. + // We use `Reverse` on `OddBall` so that we can bias towards selecting shallower `OddBall`s. + // `OddBall`s are selected by highest score and then by shallowest depth. + let mut candidates = clusters + .into_iter() + .zip(scores.into_iter().map(OrderedFloat)) + .filter(|(c, _)| c.is_leaf() || c.depth() >= min_depth) + .map(|(c, s)| (s, Reverse(c))) + .collect::>(); + + let mut clusters = vec![]; + while let Some((_, Reverse(c))) = candidates.pop() { + clusters.push(c); + // Remove `OddBall`s that are ancestors or descendants of the selected `OddBall`, so as not to have duplicates + // in the `Graph`. + candidates.retain(|&(_, Reverse(other))| !(c.is_descendant_of(other) || other.is_descendant_of(c))); + } + + Self::from_clusters(&clusters, data) + } + + /// Create a new `Graph` from a collection of `OddBall`s. 
+ pub fn from_clusters>(clusters: &[&'a C], data: &D) -> Self { + let components = Component::new(clusters, data); + let populations = components + .iter() + .map(Component::population) + .scan(0, |acc, x| { + *acc += x; + Some(*acc) + }) + .collect::>(); + Self { + components, + populations, + } + } + + /// Iterate over the `OddBall`s in the `Graph`. + pub fn iter_clusters(&self) -> impl Iterator { + self.components.iter().flat_map(Component::iter_clusters) + } + + /// Iterate over the lists of neighbors of the `OddBall`s in the `Graph`. + pub fn iter_neighbors(&self) -> impl Iterator> { + self.components.iter().flat_map(Component::iter_neighbors) + } + + /// Iterate over the anomaly properties of the `OddBall`s in the `Graph`. + pub fn iter_anomaly_properties(&self) -> impl Iterator> { + self.components.iter().flat_map(Component::iter_anomaly_properties) + } + + /// Get the diameter of the `Graph`. + pub fn diameter(&mut self) -> usize { + self.components.iter_mut().map(Component::diameter).max().unwrap_or(0) + } + + /// Get the neighborhood sizes of all `OddBall`s in the `Graph`. + pub fn neighborhood_sizes(&mut self) -> impl Iterator> + '_ { + self.components + .iter_mut() + .map(Component::neighborhood_sizes) + .collect::>() + .into_iter() + .flatten() + } + + /// Get the total number of points in the `Graph`. + #[must_use] + pub fn population(&self) -> usize { + self.populations.last().copied().unwrap_or(0) + } + + /// Iterate over the `Component`s in the `Graph`. + pub(crate) fn iter_components(&self) -> impl Iterator> { + self.components.iter() + } + + /// Compute the stationary probability of each `OddBall` in the `Graph`. + #[must_use] + pub fn compute_stationary_probabilities(&self, num_steps: usize) -> Vec { + self.components + .iter() + .flat_map(|c| c.compute_stationary_probabilities(num_steps)) + .collect() + } + + /// Get the accumulated child-parent cardinality ratio of each `OddBall` in the `Graph`. 
+ #[must_use] + pub fn accumulated_cp_car_ratios(&self) -> Vec { + self.components + .iter() + .flat_map(Component::accumulated_cp_car_ratios) + .collect() + } +} diff --git a/crates/abd-clam/src/chaoda/members/cc.rs b/crates/abd-clam/src/chaoda/members/cc.rs new file mode 100644 index 000000000..7affeff4f --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/cc.rs @@ -0,0 +1,32 @@ +//! Cluster Cardinality algorithm. + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::chaoda::{Graph, OddBall}; + +use super::Algorithm; + +/// `Cluster`s with relatively few points are more likely to be anomalous. +#[derive(Clone, Serialize, Deserialize)] +pub struct ClusterCardinality; + +impl Algorithm for ClusterCardinality { + fn name(&self) -> String { + "cc".to_string() + } + + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + g.iter_clusters().map(|c| -c.cardinality().as_f32()).collect() + } + + fn normalize_by_cluster(&self) -> bool { + true + } +} + +impl Default for ClusterCardinality { + fn default() -> Self { + Self + } +} diff --git a/crates/abd-clam/src/chaoda/members/gn.rs b/crates/abd-clam/src/chaoda/members/gn.rs new file mode 100644 index 000000000..e23ae2535 --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/gn.rs @@ -0,0 +1,56 @@ +//! Graph Neighborhood Algorithm + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::chaoda::{Graph, OddBall}; + +use super::Algorithm; + +/// `Cluster`s in an isolated neighborhood are more likely to be anomalous. +#[derive(Clone, Serialize, Deserialize)] +pub struct GraphNeighborhood { + /// The fraction of graph diameter to use as the neighborhood radius. + diameter_fraction: f32, +} + +impl GraphNeighborhood { + /// Create a new `GraphNeighborhood` algorithm. + /// + /// # Parameters + /// + /// * `diameter_fraction`: The fraction of graph diameter to use as the neighborhood radius. 
+ pub fn new(diameter_fraction: f32) -> Result { + if diameter_fraction <= 0.0 || diameter_fraction >= 1.0 { + Err("Diameter fraction must be in the range [0, 1]".to_string()) + } else { + Ok(Self { diameter_fraction }) + } + } +} + +impl Algorithm for GraphNeighborhood { + fn name(&self) -> String { + format!("gn-{}", self.diameter_fraction) + } + + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + let diameter = g.diameter(); + #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] + let k = (self.diameter_fraction * diameter.as_f32()).round() as usize; + g.neighborhood_sizes() + .map(|n| if n.len() <= k { n.last().unwrap_or(&0) } else { &n[k] }) + .map(|&n| -n.as_f32()) + .collect() + } + + fn normalize_by_cluster(&self) -> bool { + true + } +} + +impl Default for GraphNeighborhood { + fn default() -> Self { + Self { diameter_fraction: 0.1 } + } +} diff --git a/crates/abd-clam/src/chaoda/members/mod.rs b/crates/abd-clam/src/chaoda/members/mod.rs new file mode 100644 index 000000000..0840a071d --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/mod.rs @@ -0,0 +1,203 @@ +//! The individual algorithms that make up the CHAODA ensemble. + +mod cc; +mod gn; +mod pc; +mod sc; +mod sp; +mod vd; + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::utils; + +use super::{Graph, OddBall}; + +/// The algorithms that make up the CHAODA ensemble. +#[derive(Clone, Serialize, Deserialize)] +pub enum Member { + /// The Cluster Cardinality algorithm. + CC(cc::ClusterCardinality), + /// The Graph Neighborhood algorithm. + GN(gn::GraphNeighborhood), + /// The Parent Cardinality algorithm. + PC(pc::ParentCardinality), + /// The Subgraph Cardinality algorithm. + SC(sc::SubgraphCardinality), + /// The Stationary Probability algorithm. + SP(sp::StationaryProbability), + /// The Vertex Degree algorithm. + VD(vd::VertexDegree), +} + +impl Member { + /// Create a new `ChaodaMember` algorithm using default parameters. 
+ /// + /// # Parameters + /// + /// * `model`: The name of the algorithm. + /// + /// # Errors + /// + /// If the algorithm name is not recognized. + pub fn new(model: &str) -> Result { + Ok(match model { + "cc" | "CC" | "ClusterCardinality" => Self::CC(cc::ClusterCardinality), + "gn" | "GN" | "GraphNeighborhood" => Self::GN(gn::GraphNeighborhood::default()), + "pc" | "PC" | "ParentCardinality" => Self::PC(pc::ParentCardinality), + "sc" | "SC" | "SubgraphCardinality" => Self::SC(sc::SubgraphCardinality), + "sp" | "SP" | "StationaryProbability" => Self::SP(sp::StationaryProbability::default()), + "vd" | "VD" | "VertexDegree" => Self::VD(vd::VertexDegree), + _ => return Err(format!("Unknown model: {model}")), + }) + } + + /// Create the default set of algorithms for the CHAODA ensemble. + #[must_use] + pub fn default_members() -> Vec { + vec![ + Self::CC(cc::ClusterCardinality), + Self::GN(gn::GraphNeighborhood::default()), + Self::PC(pc::ParentCardinality), + Self::SC(sc::SubgraphCardinality), + // Self::SP(sp::StationaryProbability::default()), + Self::VD(vd::VertexDegree), + ] + } + + /// Get the name of the algorithm. + #[must_use] + pub fn name(&self) -> String { + match self { + Self::CC(a) => a.name(), + Self::GN(a) => a.name(), + Self::PC(a) => a.name(), + Self::SC(a) => a.name(), + Self::SP(a) => a.name(), + Self::VD(a) => a.name(), + } + } + + /// Evaluate the algorithm on a `Graph` and return a vector of scores for each + /// `OddBall` in the `Graph`. + /// + /// The output vector must be the same length as the number of `OddBall`s in + /// the `Graph`, and the order of the scores must correspond to the order of the + /// `OddBall`s in the `Graph`. 
+ pub fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + match self { + Self::CC(a) => a.evaluate_clusters(g), + Self::GN(a) => a.evaluate_clusters(g), + Self::PC(a) => a.evaluate_clusters(g), + Self::SC(a) => a.evaluate_clusters(g), + Self::SP(a) => a.evaluate_clusters(g), + Self::VD(a) => a.evaluate_clusters(g), + } + } + + /// Whether to normalize anomaly scores by cluster or by point. + #[must_use] + pub fn normalize_by_cluster(&self) -> bool { + match self { + Self::CC(a) => a.normalize_by_cluster(), + Self::GN(a) => a.normalize_by_cluster(), + Self::PC(a) => a.normalize_by_cluster(), + Self::SC(a) => a.normalize_by_cluster(), + Self::SP(a) => a.normalize_by_cluster(), + Self::VD(a) => a.normalize_by_cluster(), + } + } + + /// Compute the anomaly scores for all points in the `Graph`. + /// + /// This method is a convenience method that wraps the `evaluate` and `inherit_scores` + /// methods. It evaluates the algorithm on the `Graph` and then inherits the scores + /// from the `OddBall`s to the points. It correctly handles normalization by cluster + /// or by point. + /// + /// # Returns + /// + /// * A vector of anomaly scores for each point in the `Graph`. + pub fn evaluate_points>(&self, g: &mut Graph) -> Vec { + match self { + Self::CC(a) => a.evaluate_points(g), + Self::GN(a) => a.evaluate_points(g), + Self::PC(a) => a.evaluate_points(g), + Self::SC(a) => a.evaluate_points(g), + Self::SP(a) => a.evaluate_points(g), + Self::VD(a) => a.evaluate_points(g), + } + } + + /// Normalize the scores using the Error Function. + #[must_use] + pub fn normalize_scores(scores: &[f32]) -> Vec { + let mean = utils::mean(scores); + let sd = utils::standard_deviation(scores); + utils::normalize_1d(scores, mean, sd) + } +} + +/// A trait for an algorithm in the CHAODA ensemble. +trait Algorithm: Default + Clone + Send + Sync + Serialize + for<'de> Deserialize<'de> { + /// Get the name of the algorithm. 
+ fn name(&self) -> String; + + /// Evaluate the algorithm on a `Graph` and return a vector of scores for each + /// `OddBall` in the `Graph`. + /// + /// The output vector must be the same length as the number of `OddBall`s in + /// the `Graph`, and the order of the scores must correspond to the order of the + /// `OddBall`s in the `Graph`. + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec; + + /// Whether to normalize anomaly scores by cluster or by point. + fn normalize_by_cluster(&self) -> bool; + + /// Have points inherit scores from `OddBall`s. + fn inherit_scores>(&self, g: &Graph, scores: &[f32]) -> Vec { + let mut points_scores = vec![0.0; g.population()]; + for (c, &s) in g.iter_clusters().zip(scores.iter()) { + for i in c.indices() { + points_scores[i] = s; + } + } + points_scores + } + + /// Compute the anomaly scores for all points in the `Graph`. + /// + /// This method is a convenience method that wraps the `evaluate` and `inherit_scores` + /// methods. It evaluates the algorithm on the `Graph` and then inherits the scores + /// from the `OddBall`s to the points. It correctly handles normalization by cluster + /// or by point. + /// + /// # Returns + /// + /// * A vector of anomaly scores for each point in the `Graph`. + fn evaluate_points>(&self, g: &mut Graph) -> Vec { + let cluster_scores = { + let scores = self.evaluate_clusters(g); + if self.normalize_by_cluster() { + self.normalize_scores(&scores) + } else { + scores + } + }; + + let scores = self.inherit_scores(g, &cluster_scores); + if self.normalize_by_cluster() { + scores + } else { + self.normalize_scores(&scores) + } + } + + /// Normalize the scores using the Error Function. 
+ fn normalize_scores(&self, scores: &[f32]) -> Vec { + let mean = utils::mean(scores); + let sd = utils::standard_deviation(scores); + utils::normalize_1d(scores, mean, sd) + } +} diff --git a/crates/abd-clam/src/chaoda/members/pc.rs b/crates/abd-clam/src/chaoda/members/pc.rs new file mode 100644 index 000000000..fa1473e68 --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/pc.rs @@ -0,0 +1,32 @@ +//! Relative Parent Cardinality algorithm. + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::chaoda::{Graph, OddBall}; + +use super::Algorithm; + +/// `Cluster`s with a smaller fraction of points from their parent `Cluster` are more anomalous. +#[derive(Clone, Serialize, Deserialize)] +pub struct ParentCardinality; + +impl Algorithm for ParentCardinality { + fn name(&self) -> String { + "pc".to_string() + } + + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + g.accumulated_cp_car_ratios() + } + + fn normalize_by_cluster(&self) -> bool { + true + } +} + +impl Default for ParentCardinality { + fn default() -> Self { + Self + } +} diff --git a/crates/abd-clam/src/chaoda/members/sc.rs b/crates/abd-clam/src/chaoda/members/sc.rs new file mode 100644 index 000000000..781e23bd7 --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/sc.rs @@ -0,0 +1,37 @@ +//! Subgraph Cardinality algorithm. + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::chaoda::{Graph, OddBall}; + +use super::Algorithm; + +/// `Cluster`s in subgraphs with relatively small population are more likely to be anomalous. 
+#[derive(Clone, Serialize, Deserialize)] +pub struct SubgraphCardinality; + +impl Algorithm for SubgraphCardinality { + fn name(&self) -> String { + "sc".to_string() + } + + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + g.iter_components() + .flat_map(|sg| { + let p = -sg.population().as_f32(); + core::iter::repeat(p).take(sg.cardinality()) + }) + .collect() + } + + fn normalize_by_cluster(&self) -> bool { + true + } +} + +impl Default for SubgraphCardinality { + fn default() -> Self { + Self + } +} diff --git a/crates/abd-clam/src/chaoda/members/sp.rs b/crates/abd-clam/src/chaoda/members/sp.rs new file mode 100644 index 000000000..48ddbab2d --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/sp.rs @@ -0,0 +1,49 @@ +//! Stationary Probabilities Algorithm. + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::chaoda::{Graph, OddBall}; + +use super::Algorithm; + +/// Clusters with smaller stationary probabilities are more anomalous. +#[derive(Clone, Serialize, Deserialize)] +pub struct StationaryProbability { + /// The Random Walk will be simulated for 2^`num_steps` steps. + num_steps: usize, +} + +impl StationaryProbability { + /// Create a new `StationaryProbability` algorithm. + /// + /// # Arguments + /// + /// * `num_steps`: The Random Walk will be simulated for 2^`num_steps` steps. 
+ pub const fn new(num_steps: usize) -> Self { + Self { num_steps } + } +} + +impl Algorithm for StationaryProbability { + fn name(&self) -> String { + format!("sp-{}", self.num_steps) + } + + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + g.compute_stationary_probabilities(self.num_steps) + .into_iter() + .map(|x| 1.0 - x) + .collect() + } + + fn normalize_by_cluster(&self) -> bool { + true + } +} + +impl Default for StationaryProbability { + fn default() -> Self { + Self { num_steps: 16 } + } +} diff --git a/crates/abd-clam/src/chaoda/members/vd.rs b/crates/abd-clam/src/chaoda/members/vd.rs new file mode 100644 index 000000000..497e2ee86 --- /dev/null +++ b/crates/abd-clam/src/chaoda/members/vd.rs @@ -0,0 +1,32 @@ +//! Vertex Degree Algorithm + +use distances::Number; +use serde::{Deserialize, Serialize}; + +use crate::chaoda::{Graph, OddBall}; + +use super::Algorithm; + +/// `Cluster`s with relatively few neighbors are more likely to be anomalous. +#[derive(Clone, Serialize, Deserialize)] +pub struct VertexDegree; + +impl Algorithm for VertexDegree { + fn name(&self) -> String { + "vd".to_string() + } + + fn evaluate_clusters>(&self, g: &mut Graph) -> Vec { + g.iter_neighbors().map(|n| -n.len().as_f32()).collect() + } + + fn normalize_by_cluster(&self) -> bool { + true + } +} + +impl Default for VertexDegree { + fn default() -> Self { + Self + } +} diff --git a/crates/abd-clam/src/chaoda/meta_ml.rs b/crates/abd-clam/src/chaoda/meta_ml.rs new file mode 100644 index 000000000..450f3f66b --- /dev/null +++ b/crates/abd-clam/src/chaoda/meta_ml.rs @@ -0,0 +1,187 @@ +//! Meta Machine Learning models for CHAODA. 
+ +use serde::{Deserialize, Serialize}; +use smartcore::{ + ensemble::random_forest_regressor::{RandomForestRegressor, RandomForestRegressorParameters}, + linalg::basic::matrix::DenseMatrix, + linear::{ + elastic_net::{ElasticNet, ElasticNetParameters}, + lasso::{Lasso, LassoParameters}, + linear_regression::{LinearRegression, LinearRegressionParameters, LinearRegressionSolverName}, + ridge_regression::{RidgeRegression, RidgeRegressionParameters}, + }, + tree::decision_tree_regressor::{DecisionTreeRegressor, DecisionTreeRegressorParameters}, +}; + +/// A Meta Machine Learning model for CHAODA. +#[derive(Serialize, Deserialize)] +pub enum MlModel { + /// A linear regression model. + LinearRegression(LinearRegression, Vec>), + /// An Elastic Net model. + ElasticNet(ElasticNet, Vec>), + /// A Lasso model. + Lasso(Lasso, Vec>), + /// A Ridge Regression model. + RidgeRegression(RidgeRegression, Vec>), + /// A Decision Tree Regressor model. + DecisionTreeRegressor(DecisionTreeRegressor, Vec>), + /// A Random Forest Regressor model. + RandomForestRegressor(RandomForestRegressor, Vec>), +} + +impl MlModel { + /// Create a new `MetaMlModel`. + /// + /// # Arguments + /// + /// * `model`: The name of the model. + /// + /// # Errors + /// + /// * If the model name is unknown. + pub fn new(model: &str) -> Result { + // Dummy data for model initialization. This comes from the `smartcore` examples. 
+ let dummy_x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let dummy_y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9, + ]; + + Ok(match model { + "lr" | "LR" | "LinearRegression" => Self::LinearRegression( + LinearRegression::fit( + &dummy_x, + &dummy_y, + LinearRegressionParameters::default().with_solver(LinearRegressionSolverName::QR), + ) + .map_err(|e| e.to_string())?, + ), + "en" | "EN" | "ElasticNet" => Self::ElasticNet( + ElasticNet::fit(&dummy_x, &dummy_y, ElasticNetParameters::default()).map_err(|e| e.to_string())?, + ), + "la" | "LA" | "Lasso" => { + Self::Lasso(Lasso::fit(&dummy_x, &dummy_y, LassoParameters::default()).map_err(|e| e.to_string())?) 
+ } + "rr" | "RR" | "RidgeRegression" => Self::RidgeRegression( + RidgeRegression::fit(&dummy_x, &dummy_y, RidgeRegressionParameters::default()) + .map_err(|e| e.to_string())?, + ), + "dt" | "DT" | "DecisionTreeRegressor" => Self::DecisionTreeRegressor( + DecisionTreeRegressor::fit(&dummy_x, &dummy_y, DecisionTreeRegressorParameters::default()) + .map_err(|e| e.to_string())?, + ), + "rf" | "RF" | "RandomForestRegressor" => Self::RandomForestRegressor( + RandomForestRegressor::fit(&dummy_x, &dummy_y, RandomForestRegressorParameters::default()) + .map_err(|e| e.to_string())?, + ), + _ => return Err(format!("Unknown model: {model}")), + }) + } + + /// Get the default models. + #[must_use] + pub fn defaults() -> Vec { + vec![ + Self::new("LR").unwrap_or_else(|e| unreachable!("{e}")), + // Self::new("EN").unwrap_or_else(|e| unreachable!("{e}")), + // Self::new("LA").unwrap_or_else(|e| unreachable!("{e}")), + // Self::new("RR").unwrap_or_else(|e| unreachable!("{e}")), + Self::new("DT").unwrap_or_else(|e| unreachable!("{e}")), + // Self::new("RF").unwrap_or_else(|e| unreachable!("{e}")), + ] + } + + /// Train the model on data from a `Graph`. + /// + /// # Arguments + /// + /// * `data`: A matrix where each row contains the aggregated anomaly properties of `Cluster`s in a `Graph`. + /// * `roc_scores`: The ROC score for each `Cluster`. + /// + /// # Errors + /// + /// * If the number of `roc_scores` is not equal to the number of rows in `data`. 
+ pub fn train(&mut self, data: &[Vec], roc_scores: &Vec) -> Result<(), String> { + let data = data.iter().map(Vec::as_slice).collect::>(); + let data = DenseMatrix::from_2d_array(&data); + match self { + Self::LinearRegression(model) => { + *model = LinearRegression::fit(&data, roc_scores, LinearRegressionParameters::default()) + .map_err(|e| e.to_string())?; + Ok(()) + } + Self::ElasticNet(model) => { + *model = + ElasticNet::fit(&data, roc_scores, ElasticNetParameters::default()).map_err(|e| e.to_string())?; + Ok(()) + } + Self::Lasso(model) => { + *model = Lasso::fit(&data, roc_scores, LassoParameters::default()).map_err(|e| e.to_string())?; + Ok(()) + } + Self::RidgeRegression(model) => { + *model = RidgeRegression::fit(&data, roc_scores, RidgeRegressionParameters::default()) + .map_err(|e| e.to_string())?; + Ok(()) + } + Self::DecisionTreeRegressor(model) => { + *model = DecisionTreeRegressor::fit(&data, roc_scores, DecisionTreeRegressorParameters::default()) + .map_err(|e| e.to_string())?; + Ok(()) + } + Self::RandomForestRegressor(model) => { + *model = RandomForestRegressor::fit(&data, roc_scores, RandomForestRegressorParameters::default()) + .map_err(|e| e.to_string())?; + Ok(()) + } + } + } + + /// Predict the suitability of several `Cluster`s for selection in a `Graph`. + /// + /// This method is convenient when we want to predict the suitability of several `Cluster`s at once, + /// and using several `MetaML` models. + /// + /// # Arguments + /// + /// * `data`: A matrix where each row contains the aggregated anomaly properties of `Cluster`s in a `Graph`. + /// + /// # Returns + /// + /// The suitability of the `Cluster`s for selection in a `Graph`. + /// + /// # Errors + /// + /// * If the number of features in the data does not match the number of features in the model. + /// * If the model cannot predict the data. 
+ pub fn predict(&self, data: &[Vec]) -> Result, String> { + let data = data.iter().map(Vec::as_slice).collect::>(); + let data = DenseMatrix::from_2d_array(&data); + match self { + Self::LinearRegression(model) => model.predict(&data), + Self::ElasticNet(model) => model.predict(&data), + Self::Lasso(model) => model.predict(&data), + Self::RidgeRegression(model) => model.predict(&data), + Self::DecisionTreeRegressor(model) => model.predict(&data), + Self::RandomForestRegressor(model) => model.predict(&data), + } + .map_err(|e| e.to_string()) + } +} diff --git a/crates/abd-clam/src/chaoda/mod.rs b/crates/abd-clam/src/chaoda/mod.rs index c78af1deb..56b276e6e 100644 --- a/crates/abd-clam/src/chaoda/mod.rs +++ b/crates/abd-clam/src/chaoda/mod.rs @@ -1,5 +1,350 @@ //! Clustered Hierarchical Anomaly and Outlier Detection Algorithms (CHAODA) mod cluster; +mod component; +mod graph; +mod members; +mod meta_ml; -pub use cluster::{OddBall, Vertex}; +use distances::Number; +use ndarray::prelude::*; +use serde::{Deserialize, Serialize}; +use smartcore::metrics::roc_auc_score; + +use crate::{Dataset, Partition}; + +pub use cluster::{OddBall, Ratios, Vertex}; +pub use component::{AdjacencyList, Component, Neighbors}; +pub use graph::Graph; +pub use members::Member; +pub use meta_ml::MlModel; + +/// The training data for the ensemble. +/// +/// The outer vector is the ensemble members. +/// The middle vector is the meta-ML models. +/// The inner vector is the epoch training data. +/// The tuple is the anomaly ratios and roc scores. +pub type TrainingData = Vec>, Vec)>>; + +/// A CHAODA ensemble. +#[derive(Serialize, Deserialize)] +pub struct Chaoda { + /// The combination of the CHAODA algorithms and the meta-ML models. + algorithms: Vec<(Member, Vec)>, + /// The minimum depth of `Cluster`s to consider for selection. 
+ min_depth: usize, +} + +impl Default for Chaoda { + fn default() -> Self { + Self { + algorithms: Member::default_members() + .into_iter() + .map(|member| (member, MlModel::defaults())) + .collect(), + min_depth: 4, + } + } +} + +impl Chaoda { + /// Create a new `Chaoda` ensemble. + #[must_use] + pub const fn new(algorithms: Vec<(Member, Vec)>, min_depth: usize) -> Self { + Self { algorithms, min_depth } + } + + /// Get the number of predictors in the ensemble. + #[must_use] + pub fn num_predictors(&self) -> usize { + self.algorithms.iter().map(|(_, models)| models.len()).sum() + } + + /// Save the model to a given path. + /// + /// # Arguments + /// + /// * `path`: The path to save the model to. + /// + /// # Errors + /// + /// * If there is an error creating the file. + /// * If there is an error serializing the model. + pub fn save(&self, path: &std::path::Path) -> Result<(), String> { + let file = std::fs::File::create(path).map_err(|e| format!("Error creating file: {e}"))?; + bincode::serialize_into(file, self).map_err(|e| format!("Error serializing: {e}"))?; + Ok(()) + } + + /// Load the model from a given path. + /// + /// # Arguments + /// + /// * `path`: The path to load the model from. + /// + /// # Errors + /// + /// * If there is an error opening the file. + /// * If there is an error deserializing the model. + pub fn load(path: &std::path::Path) -> Result { + let file = std::fs::File::open(path).map_err(|e| format!("Error opening file: {e}"))?; + let model = bincode::deserialize_from(file).map_err(|e| format!("Error deserializing: {e}"))?; + Ok(model) + } + + /// Predict the anomaly scores for the given dataset and root `Cluster`. + /// + /// This method produces scores for points in their original order. + /// + /// # Arguments + /// + /// * `data`: The dataset to predict on. + /// * `root`: The root `Cluster` of a tree that was already built over `data`. + /// One `Graph` per ensemble member and meta-ML model is created from this tree. 
+ /// This method does not partition `data` itself. + /// + /// # Returns + /// + /// The anomaly scores for each point in the dataset. + pub fn predict(&self, data: &D, root: &C) -> Vec + where + U: Number, + D: Dataset, + C: OddBall, + { + let mut graphs = self.create_graphs(data, root); + let predictions = self + .algorithms + .iter() + .zip(graphs.iter_mut()) + .flat_map(|((member, _), m_graphs)| { + m_graphs + .iter_mut() + .map(|g| member.evaluate_points(g)) + .collect::>() + }) + .collect::>(); + + Self::aggregate_predictions(&predictions) + } + + /// Aggregate the predictions of the ensemble. + /// + /// For now, we take the mean of the anomaly scores for each point. Later, + /// we may want to consider other aggregation methods. Empty `scores` yield an empty vector. + #[must_use] + pub fn aggregate_predictions(scores: &[Vec]) -> Vec { + // Take the mean of the anomaly scores for each point; with no rows there is nothing to average + let shape = (scores.len(), scores.first().map_or(0, Vec::len)); + let scores = scores.iter().flat_map(Vec::as_slice).copied().collect::>(); + Array2::from_shape_vec(shape, scores) + .unwrap_or_else(|_| unreachable!("We made sure the shape was correct.")) + .mean_axis(Axis(0)) + .map(|means| means.to_vec()) + .unwrap_or_default() + } + + /// Train the ensemble on the given datasets. + /// + /// # Arguments + /// + /// * `datasets`: The datasets and labels to train on. + /// * `num_epochs`: The number of epochs to train for. + /// * `criteria`: The partition criterion to use for building the tree. + /// * `previous_data`: The previous training data to start from, if any. + /// * `seed`: The seed to use for random number generation, if any. + /// + /// # Type Parameters + /// + /// * `I`: The type of the instances in the dataset. + /// * `U`: The type of the distance values. + /// * `D`: The type of the dataset. + /// * `C`: The type of the `OddBall` `Cluster`. + /// * `N`: Half the number of anomaly properties in the `Cluster`. + /// * `P`: The partition criteria. 
+ pub fn train( + &mut self, + datasets: &[(D, Vec)], + num_epochs: usize, + criteria: &P, + previous_data: Option, + seed: Option, + ) where + U: Number, + D: Dataset, + C: OddBall + Partition, + P: Fn(&C) -> bool, + { + let mut fresh_start = previous_data.is_none(); + + let mut full_training_data = previous_data.unwrap_or_else(|| { + self.algorithms + .iter() + .map(|(_, models)| models.iter().map(|_| (Vec::new(), Vec::new())).collect::>()) + .collect() + }); + let mut graphs; + + for e in 0..num_epochs { + let training_data_size = full_training_data + .iter() + .map(|m| m.iter().map(|m| m.0.len()).sum::()) + .sum::(); + println!( + "Starting Epoch {}/{num_epochs} with dataset size: {training_data_size}", + e + 1 + ); + + for (data, labels) in datasets { + // Build the tree with a seed that varies per epoch and dataset (wrapping, so large seeds cannot overflow) + let seed = seed.map(|s| s.wrapping_add((e + data.cardinality()) as u64)); + let (root, _) = C::new_tree(data, criteria, seed); + + // Create the graphs; on a fresh start (no previous training data) clusters are selected by depth instead of by the meta-ML models + graphs = if fresh_start { + fresh_start = false; + let cluster_scorer = |clusters: &[&C]| { + clusters + .iter() + .map(|c| { + if c.depth() == self.min_depth || (c.is_leaf() && c.depth() < self.min_depth) { + 1.0 + } else { + 0.0 + } + }) + .collect::>() + }; + let graph = Graph::from_tree(&root, data, cluster_scorer, self.min_depth); + self.algorithms + .iter() + .map(|(_, models)| models.iter().map(|_| graph.clone()).collect::>()) + .collect::>() + } else { + self.create_graphs(data, &root) + }; + + // Create the new training data + let new_training_data = self.generate_training_data(&mut graphs, labels); + + // Aggregate the training data + full_training_data = full_training_data + .into_iter() + .zip(new_training_data) + .map(|(m_old, m_new)| { + m_old + .into_iter() + .zip(m_new) + .map(|((mut x_old, mut y_old), (mut x_new, mut y_new))| { + x_old.append(&mut x_new); + y_old.append(&mut y_new); + (x_old, y_old) + }) + .collect::>() + }) + .collect::>(); + + // Train the inner models + self.train_inner_models(&full_training_data); + + // Report the ROC score + 
let y_true = labels.iter().map(|&b| if b { 1.0 } else { 0.0 }).collect::>(); + let roc_score = { + let predictions = self.predict(data, &root); + roc_auc_score(&y_true, &predictions) + }; + println!("ROC AUC Score: {roc_score}"); + } + } + } + + /// Create `Graph`s for the ensemble. + fn create_graphs<'a, I, U, D, C>(&self, data: &D, root: &'a C) -> Vec>> + where + U: Number + 'a, + D: Dataset, + C: OddBall, + { + self.algorithms + .iter() + .map(|(_, models)| { + models + .iter() + .map(|ml_model| { + let cluster_scorer = |clusters: &[&C]| { + let properties = clusters.iter().map(|c| c.ratios()).collect::>(); + ml_model + .predict(&properties) + .unwrap_or_else(|_| unreachable!("We made sure the shape was correct.")) + }; + Graph::from_tree(root, data, cluster_scorer, self.min_depth) + }) + .collect() + }) + .collect() + } + + /// Generate training data from `Graph`s. + fn generate_training_data(&self, graphs: &mut [Vec>], labels: &[bool]) -> TrainingData + where + U: Number, + C: OddBall, + { + self.algorithms + .iter() + .zip(graphs.iter_mut()) + .map(|((member, _), m_graphs)| { + m_graphs + .iter_mut() + .map(|g| { + let train_x = g.iter_anomaly_properties().cloned().collect::>(); + let anomaly_ratings = Member::normalize_scores(&member.evaluate_clusters(g)); + let train_y = g + .iter_clusters() + .zip(anomaly_ratings) + .map(|(c, rating)| { + let indices = c.indices(); + // The roc-score function needs both classes represented so we add a + // couple of dummy values to the end of the vectors. + let mut y_true = indices + .iter() + .map(|&i| if labels[i] { 1.0 } else { 0.0 }) + .collect::>(); + y_true.push(1.0); + y_true.push(0.0); + let mut y_pred = vec![rating; c.cardinality()]; + y_pred.push(1.0); + y_pred.push(0.0); + roc_auc_score(&y_true, &y_pred).as_f32() + }) + .collect(); + (train_x, train_y) + }) + .collect() + }) + .collect() + } + + /// Train the inner models given the training data. 
+ fn train_inner_models(&mut self, training_data: &TrainingData) { + self.algorithms + .iter_mut() + .zip(training_data) + .for_each(|((_, ml_models), member_data)| { + ml_models + .iter_mut() + .zip(member_data) + .for_each(|(model, (train_x, train_y))| { + model.train(train_x, train_y).unwrap_or_else(|e| unreachable!("{e}")); + }); + }); + } + + /// Compute the ROC AUC score. + #[allow(clippy::ptr_arg)] + #[must_use] + pub fn roc_auc_score(y_true: &Vec, y_pred: &Vec) -> f32 { + roc_auc_score(y_true, y_pred).as_f32() + } +} diff --git a/crates/abd-clam/src/core/cluster/ball.rs b/crates/abd-clam/src/core/cluster/ball.rs index 06d5fe873..e923f718d 100644 --- a/crates/abd-clam/src/core/cluster/ball.rs +++ b/crates/abd-clam/src/core/cluster/ball.rs @@ -2,6 +2,8 @@ use core::fmt::Debug; +use std::hash::Hash; + use distances::Number; use crate::{ @@ -13,6 +15,7 @@ use super::{Children, Cluster, IndexStore, ParCluster, ParPartition, Partition, /// A metric-`Ball` is a collection of instances that are within a certain /// distance of a center. +#[derive(Clone)] pub struct Ball { /// Parameters used for creating the `Ball`. depth: usize, @@ -85,6 +88,16 @@ impl Ord for Ball { } } +impl Hash for Ball { + #[allow(unused_variables)] + fn hash(&self, state: &mut H) { + todo!() + // self.depth.hash(state); + // self.index_store.hash(state); + // self.cardinality.hash(state); + } +} + impl Cluster for Ball { fn new>(data: &D, indices: &[usize], depth: usize, seed: Option) -> (Self, usize) where diff --git a/crates/abd-clam/src/core/cluster/children.rs b/crates/abd-clam/src/core/cluster/children.rs index b3b96e234..ed3f006ba 100644 --- a/crates/abd-clam/src/core/cluster/children.rs +++ b/crates/abd-clam/src/core/cluster/children.rs @@ -3,7 +3,7 @@ use distances::Number; /// The `Children` of a `Cluster`. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Children { /// The children of the `Cluster`. 
children: Vec>, diff --git a/crates/abd-clam/src/core/cluster/index_store.rs b/crates/abd-clam/src/core/cluster/index_store.rs index e59116847..0e7694ea3 100644 --- a/crates/abd-clam/src/core/cluster/index_store.rs +++ b/crates/abd-clam/src/core/cluster/index_store.rs @@ -9,7 +9,7 @@ use super::Cluster; /// The various ways to store the indices of a `Cluster`. #[non_exhaustive] -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] pub enum IndexStore { /// Every `Cluster` stores the indices of its instances. EveryCluster(Vec), diff --git a/crates/abd-clam/src/core/cluster/mod.rs b/crates/abd-clam/src/core/cluster/mod.rs index 948823d31..33dac0c4a 100644 --- a/crates/abd-clam/src/core/cluster/mod.rs +++ b/crates/abd-clam/src/core/cluster/mod.rs @@ -9,6 +9,8 @@ mod partition; use core::fmt::Debug; +use std::hash::Hash; + use distances::Number; use super::{Dataset, MetricSpace, ParDataset}; @@ -26,7 +28,7 @@ pub use partition::{ParPartition, Partition}; /// /// - `U`: The type of the distance values between instances. /// - `P`: The type of the parameters used to create the `Cluster`. -pub trait Cluster: Debug + PartialOrd { +pub trait Cluster: Debug + Ord + Clone + Hash { /// Creates a new `Cluster`. /// /// This should store indices as `IndexStore::EveryCluster`. @@ -141,6 +143,36 @@ pub trait Cluster: Debug + PartialOrd { clusters } + /// Returns whether the `Cluster` is a descendant of another `Cluster`. + /// + /// This may only return `true` if both `Cluster`s have the same variant of + /// `IndexStore`. + /// + /// If the `IndexStore` is `EveryCluster` or `LeafOnly`, then we will check + /// if the indices in `self` are a subset of the indices in `other`. + /// Otherwise, we will check if the `offset` of `self` is in the range + /// `[offset, offset + cardinality)` of `other`. 
+ fn is_descendant_of(&self, other: &Self) -> bool + where + Self: Sized, + { + match (self.index_store(), other.index_store()) { + (IndexStore::PostPermutation(s_offset), IndexStore::PostPermutation(o_offset)) => { + let o_range = (*o_offset)..(*o_offset + other.cardinality()); + o_range.contains(s_offset) + } + (IndexStore::EveryCluster(s_indices), IndexStore::EveryCluster(o_indices)) => { + let o_indices = o_indices.iter().collect::>(); + s_indices.iter().all(|i| o_indices.contains(i)) + } + (IndexStore::LeafOnly(_), IndexStore::LeafOnly(_)) => { + let o_indices = other.indices().into_iter().collect::>(); + self.indices().iter().all(|i| o_indices.contains(i)) + } + _ => false, + } + } + /// Whether the `Cluster` is a leaf in the tree. fn is_leaf(&self) -> bool where