diff --git a/.gitignore b/.gitignore index 69369904..91a0d835 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ /target **/*.rs.bk Cargo.lock + +# IDE-related +tags +rusty-tags.vi +.vscode \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 1c07183a..1063b4c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ addons: - libssl-dev cache: cargo rust: - - 1.28.0 + - 1.30.0 - stable - beta - nightly diff --git a/Cargo.toml b/Cargo.toml index 3c4b17ac..ff5dc083 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ ndarray = "0.12" noisy_float = "0.1" num-traits = "0.2" rand = "0.5" +itertools = { version = "0.7.0", default-features = false } [dev-dependencies] quickcheck = "0.7" diff --git a/src/histogram/bins.rs b/src/histogram/bins.rs new file mode 100644 index 00000000..6c018359 --- /dev/null +++ b/src/histogram/bins.rs @@ -0,0 +1,437 @@ +use ndarray::prelude::*; +use std::ops::{Index, Range}; + +/// `Edges` is a sorted collection of `A` elements used +/// to represent the boundaries of intervals ([`Bins`]) on +/// a 1-dimensional axis. +/// +/// [`Bins`]: struct.Bins.html +/// # Example: +/// +/// ``` +/// extern crate ndarray_stats; +/// extern crate noisy_float; +/// use ndarray_stats::histogram::{Edges, Bins}; +/// use noisy_float::types::n64; +/// +/// let unit_edges = Edges::from(vec![n64(0.), n64(1.)]); +/// let unit_interval = Bins::new(unit_edges); +/// // left inclusive +/// assert_eq!( +/// unit_interval.range_of(&n64(0.)).unwrap(), +/// n64(0.)..n64(1.), +/// ); +/// // right exclusive +/// assert_eq!( +/// unit_interval.range_of(&n64(1.)), +/// None +/// ); +/// ``` +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Edges { + edges: Vec, +} + +impl From> for Edges { + + /// Get an `Edges` instance from a `Vec`: + /// the vector will be sorted in increasing order + /// using an unstable sorting algorithm and duplicates + /// will be removed. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// #[macro_use(array)] + /// extern crate ndarray; + /// use ndarray_stats::histogram::Edges; + /// + /// # fn main() { + /// let edges = Edges::from(array![1, 15, 10, 10, 20]); + /// // The array gets sorted! + /// assert_eq!( + /// edges[2], + /// 15 + /// ); + /// # } + /// ``` + fn from(mut edges: Vec) -> Self { + // sort the array in-place + edges.sort_unstable(); + // remove duplicates + edges.dedup(); + Edges { edges } + } +} + +impl From> for Edges { + /// Get an `Edges` instance from a `Array1`: + /// the array elements will be sorted in increasing order + /// using an unstable sorting algorithm and duplicates will be removed. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// use ndarray_stats::histogram::Edges; + /// + /// let edges = Edges::from(vec![1, 15, 10, 20]); + /// // The vec gets sorted! + /// assert_eq!( + /// edges[1], + /// 10 + /// ); + /// ``` + fn from(edges: Array1) -> Self { + let edges = edges.to_vec(); + Self::from(edges) + } +} + +impl Index for Edges{ + type Output = A; + + /// Get the `i`-th edge. + /// + /// **Panics** if the index `i` is out of bounds. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// use ndarray_stats::histogram::Edges; + /// + /// let edges = Edges::from(vec![1, 5, 10, 20]); + /// assert_eq!( + /// edges[1], + /// 5 + /// ); + /// ``` + fn index(&self, i: usize) -> &Self::Output { + &self.edges[i] + } +} + +impl Edges { + /// Number of edges in `self`. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// extern crate noisy_float; + /// use ndarray_stats::histogram::Edges; + /// use noisy_float::types::n64; + /// + /// let edges = Edges::from(vec![n64(0.), n64(1.), n64(3.)]); + /// assert_eq!( + /// edges.len(), + /// 3 + /// ); + /// ``` + pub fn len(&self) -> usize { + self.edges.len() + } + + /// Borrow an immutable reference to the edges as a 1-dimensional + /// array view. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray; + /// extern crate ndarray_stats; + /// use ndarray::array; + /// use ndarray_stats::histogram::Edges; + /// + /// let edges = Edges::from(vec![0, 5, 3]); + /// assert_eq!( + /// edges.as_array_view(), + /// array![0, 3, 5].view() + /// ); + /// ``` + pub fn as_array_view(&self) -> ArrayView1 { + ArrayView1::from(&self.edges) + } + + /// Given `value`, it returns an option: + /// - `Some((left, right))`, where `right=left+1`, if there are two consecutive edges in + /// `self` such that `self[left] <= value < self[right]`; + /// - `None`, otherwise. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// use ndarray_stats::histogram::Edges; + /// + /// let edges = Edges::from(vec![0, 2, 3]); + /// assert_eq!( + /// edges.indices_of(&1), + /// Some((0, 1)) + /// ); + /// assert_eq!( + /// edges.indices_of(&5), + /// None + /// ); + /// ``` + pub fn indices_of(&self, value: &A) -> Option<(usize, usize)> { + // binary search for the correct bin + let n_edges = self.len(); + match self.edges.binary_search(value) { + Ok(i) if i == n_edges - 1 => None, + Ok(i) => Some((i, i + 1)), + Err(i) => { + match i { + 0 => None, + j if j == n_edges => None, + j => Some((j - 1, j)), + } + } + } + } + + pub fn iter(&self) -> impl Iterator { + self.edges.iter() + } +} + +/// `Bins` is a sorted collection of non-overlapping +/// 1-dimensional intervals. +/// +/// All intervals are left-inclusive and right-exclusive. +/// +/// # Example: +/// +/// ``` +/// extern crate ndarray_stats; +/// extern crate noisy_float; +/// use ndarray_stats::histogram::{Edges, Bins}; +/// use noisy_float::types::n64; +/// +/// let edges = Edges::from(vec![n64(0.), n64(1.), n64(2.)]); +/// let bins = Bins::new(edges); +/// // first bin +/// assert_eq!( +/// bins.index(0), +/// n64(0.)..n64(1.) // n64(1.) is not included in the bin! +/// ); +/// // second bin +/// assert_eq!( +/// bins.index(1), +/// n64(1.)..n64(2.) +/// ); +/// ``` +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Bins { + edges: Edges, +} + +impl Bins { + /// Given a collection of [`Edges`], it returns the corresponding `Bins` instance. + /// + /// [`Edges`]: struct.Edges.html + pub fn new(edges: Edges) -> Self { + Bins { edges } + } + + /// Returns the number of bins. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// extern crate noisy_float; + /// use ndarray_stats::histogram::{Edges, Bins}; + /// use noisy_float::types::n64; + /// + /// let edges = Edges::from(vec![n64(0.), n64(1.), n64(2.)]); + /// let bins = Bins::new(edges); + /// assert_eq!( + /// bins.len(), + /// 2 + /// ); + /// ``` + pub fn len(&self) -> usize { + match self.edges.len() { + 0 => 0, + n => n - 1, + } + } + + /// Given `value`, it returns: + /// - `Some(i)`, if the `i`-th bin in `self` contains `value`; + /// - `None`, if `value` does not belong to any of the bins in `self`. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// use ndarray_stats::histogram::{Edges, Bins}; + /// + /// let edges = Edges::from(vec![0, 2, 4, 6]); + /// let bins = Bins::new(edges); + /// let value = 1; + /// assert_eq!( + /// bins.index_of(&1), + /// Some(0) + /// ); + /// assert_eq!( + /// bins.index(bins.index_of(&1).unwrap()), + /// 0..2 + /// ); + /// ``` + pub fn index_of(&self, value: &A) -> Option { + self.edges.indices_of(value).map(|t| t.0) + } + + /// Given `value`, it returns: + /// - `Some(left_edge..right_edge)`, if there exists a bin in `self` such that + /// `left_edge <= value < right_edge`; + /// - `None`, otherwise. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// use ndarray_stats::histogram::{Edges, Bins}; + /// + /// let edges = Edges::from(vec![0, 2, 4, 6]); + /// let bins = Bins::new(edges); + /// assert_eq!( + /// bins.range_of(&1), + /// Some(0..2) + /// ); + /// assert_eq!( + /// bins.range_of(&10), + /// None + /// ); + /// ``` + pub fn range_of(&self, value: &A) -> Option> + where + A: Clone, + { + let edges_indexes = self.edges.indices_of(value); + edges_indexes.map( + |(left, right)| { + Range { + start: self.edges[left].clone(), + end: self.edges[right].clone(), + } + } + ) + } + + /// Get the `i`-th bin. + /// + /// **Panics** if `index` is out of bounds. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// use ndarray_stats::histogram::{Edges, Bins}; + /// + /// let edges = Edges::from(vec![1, 5, 10, 20]); + /// let bins = Bins::new(edges); + /// assert_eq!( + /// bins.index(1), + /// 5..10 + /// ); + /// ``` + pub fn index(&self, index: usize) -> Range + where + A: Clone, + { + // It was not possible to implement this functionality + // using the `Index` trait unless we were willing to + // allocate a `Vec>` in the struct. + // Index, in fact, forces you to return a reference. + Range { + start: self.edges[index].clone(), + end: self.edges[index+1].clone(), + } + } +} + +#[cfg(test)] +mod edges_tests { + use super::*; + use std::collections::BTreeSet; + use std::iter::FromIterator; + + quickcheck! { + fn check_sorted_from_vec(v: Vec) -> bool { + let edges = Edges::from(v); + let n = edges.len(); + for i in 1..n { + if edges[i-1] > edges[i] { + return false; + } + } + true + } + + fn check_sorted_from_array(v: Vec) -> bool { + let a = Array1::from_vec(v); + let edges = Edges::from(a); + let n = edges.len(); + for i in 1..n { + if edges[i-1] > edges[i] { + return false; + } + } + true + } + + fn edges_are_right_exclusive(v: Vec) -> bool { + let edges = Edges::from(v); + let view = edges.as_array_view(); + if view.len() == 0 { + true + } else { + let last = view[view.len()-1]; + edges.indices_of(&last).is_none() + } + } + + fn edges_are_left_inclusive(v: Vec) -> bool { + let edges = Edges::from(v); + match edges.len() { + 1 => true, + _ => { + let view = edges.as_array_view(); + if view.len() == 0 { + true + } else { + let first = view[0]; + edges.indices_of(&first).is_some() + } + } + } + } + + fn edges_are_deduped(v: Vec) -> bool { + let unique_elements = BTreeSet::from_iter(v.iter()); + let edges = Edges::from(v.clone()); + let view = edges.as_array_view(); + let unique_edges = BTreeSet::from_iter(view.iter()); + unique_edges == unique_elements + } + } +} + +#[cfg(test)] +mod bins_tests { + use super::*; + + #[test] + #[should_panic] + fn get_panics_for_out_of_bound_indexes() { + let edges = Edges::from(vec![0]); + let bins = Bins::new(edges); + // we need at least two edges to make a valid bin! + bins.index(0); + } +} diff --git a/src/histogram/errors.rs b/src/histogram/errors.rs new file mode 100644 index 00000000..7afaea1f --- /dev/null +++ b/src/histogram/errors.rs @@ -0,0 +1,23 @@ +use std::error; +use std::fmt; + +/// Error to denote that no bin has been found for a certain observation. +#[derive(Debug, Clone)] +pub struct BinNotFound; + +impl fmt::Display for BinNotFound { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "No bin has been found.") + } +} + +impl error::Error for BinNotFound { + fn description(&self) -> &str { + "No bin has been found." + } + + fn cause(&self) -> Option<&error::Error> { + // Generic error, underlying cause isn't tracked. + None + } +} diff --git a/src/histogram/grid.rs b/src/histogram/grid.rs new file mode 100644 index 00000000..32a7161b --- /dev/null +++ b/src/histogram/grid.rs @@ -0,0 +1,186 @@ +use super::bins::Bins; +use super::strategies::BinsBuildingStrategy; +use std::ops::Range; +use itertools::izip; +use ndarray::{ArrayBase, Data, Ix1, Ix2, Axis}; + +/// A `Grid` is a partition of a rectangular region of an *n*-dimensional +/// space—e.g. [*a*0, *b*0) × ⋯ × [*a**n*−1, +/// *b**n*−1)—into a collection of rectangular *n*-dimensional bins. +/// +/// The grid is **fully determined by its 1-dimensional projections** on the +/// coordinate axes. For example, this is a partition that can be represented +/// as a `Grid` struct: +/// ```text +/// +---+-------+-+ +/// | | | | +/// +---+-------+-+ +/// | | | | +/// | | | | +/// | | | | +/// | | | | +/// +---+-------+-+ +/// ``` +/// while the next one can't: +/// ```text +/// +---+-------+-+ +/// | | | | +/// | +-------+-+ +/// | | | +/// | | | +/// | | | +/// | | | +/// +---+-------+-+ +/// ``` +/// +/// # Example: +/// +/// ``` +/// extern crate ndarray_stats; +/// extern crate ndarray; +/// extern crate noisy_float; +/// use ndarray::{Array, array}; +/// use ndarray_stats::{HistogramExt, +/// histogram::{Histogram, Grid, GridBuilder, +/// Edges, Bins, strategies::Auto}}; +/// use noisy_float::types::{N64, n64}; +/// +/// # fn main() { +/// // 1-dimensional observations, as a (n_observations, 1) 2-d matrix +/// let observations = Array::from_shape_vec( +/// (12, 1), +/// vec![1, 4, 5, 2, 100, 20, 50, 65, 27, 40, 45, 23], +/// ).unwrap(); +/// +/// // The optimal grid layout is inferred from the data, +/// // specifying a strategy (Auto in this case) +/// let grid = GridBuilder::>::from_array(&observations).build(); +/// let expected_grid = Grid::from(vec![Bins::new(Edges::from(vec![1, 20, 39, 58, 77, 96, 115]))]); +/// assert_eq!(grid, expected_grid); +/// +/// let histogram = observations.histogram(grid); +/// +/// let histogram_matrix = histogram.counts(); +/// // Bins are left inclusive, right exclusive! +/// let expected = array![4, 3, 3, 1, 0, 1]; +/// assert_eq!(histogram_matrix, expected.into_dyn()); +/// # } +/// ``` +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Grid { + projections: Vec>, +} + +impl From>> for Grid { + + /// Get a `Grid` instance from a `Vec>`. + /// + /// The `i`-th element in `Vec>` represents the 1-dimensional + /// projection of the bin grid on the `i`-th axis. + /// + /// Alternatively, a `Grid` can be built directly from data using a + /// [`GridBuilder`]. + /// + /// [`GridBuilder`]: struct.GridBuilder.html + fn from(projections: Vec>) -> Self { + Grid { projections } + } +} + +impl Grid { + /// Returns `n`, the number of dimensions of the region partitioned by the grid. + pub fn ndim(&self) -> usize { + self.projections.len() + } + + /// Returns the number of bins along each coordinate axis. + pub fn shape(&self) -> Vec { + self.projections.iter().map(|e| e.len()).collect() + } + + /// Returns the grid projections on the coordinate axes as a slice of immutable references. + pub fn projections(&self) -> &[Bins] { + &self.projections + } + + /// Returns the index of the *n*-dimensional bin containing the point, if + /// one exists. + /// + /// Returns `None` if the point is outside the grid. + /// + /// **Panics** if `point.len()` does not equal `self.ndim()`. + pub fn index_of(&self, point: &ArrayBase) -> Option> + where + S: Data, + { + assert_eq!(point.len(), self.ndim(), + "Dimension mismatch: the point has {:?} dimensions, the grid \ + expected {:?} dimensions.", point.len(), self.ndim()); + point + .iter() + .zip(self.projections.iter()) + .map(|(v, e)| e.index_of(v)) + .collect() + } +} + +impl Grid { + /// Given `i=(i_0, ..., i_{n-1})`, an `n`-dimensional index, it returns + /// `I_{i_0}x...xI_{i_{n-1}}`, an `n`-dimensional bin, where `I_{i_j}` is + /// the `i_j`-th interval on the `j`-th projection of the grid on the coordinate axes. + /// + /// **Panics** if at least one among `(i_0, ..., i_{n-1})` is out of bounds on the respective + /// coordinate axis - i.e. if there exists `j` such that `i_j >= self.projections[j].len()`. + pub fn index(&self, index: &[usize]) -> Vec> { + assert_eq!(index.len(), self.ndim(), + "Dimension mismatch: the index has {0:?} dimensions, the grid \ + expected {1:?} dimensions.", index.len(), self.ndim()); + izip!(&self.projections, index) + .map(|(bins, &i)| bins.index(i)) + .collect() + } +} + +/// `GridBuilder`, given a [`strategy`] and some observations, returns a [`Grid`] +/// instance for [`histogram`] computation. +/// +/// [`Grid`]: struct.Grid.html +/// [`histogram`]: trait.HistogramExt.html +/// [`strategy`]: strategies/index.html +pub struct GridBuilder { + bin_builders: Vec, +} + +impl GridBuilder +where + A: Ord, + B: BinsBuildingStrategy, +{ + /// Given some observations in a 2-dimensional array with shape `(n_observations, n_dimension)` + /// it returns a `GridBuilder` instance that has learned the required parameter + /// to build a [`Grid`] according to the specified [`strategy`]. + /// + /// [`Grid`]: struct.Grid.html + /// [`strategy`]: strategies/index.html + pub fn from_array(array: &ArrayBase) -> Self + where + S: Data, + { + let bin_builders = array + .axis_iter(Axis(1)) + .map(|data| B::from_array(&data)) + .collect(); + Self { bin_builders } + } + + /// Returns a [`Grid`] instance, built accordingly to the specified [`strategy`] + /// using the parameters inferred from observations in [`from_array`]. + /// + /// [`Grid`]: struct.Grid.html + /// [`strategy`]: strategies/index.html + /// [`from_array`]: #method.from_array.html + pub fn build(&self) -> Grid { + let projections: Vec<_> = self.bin_builders.iter().map(|b| b.build()).collect(); + Grid::from(projections) + } +} diff --git a/src/histogram/histograms.rs b/src/histogram/histograms.rs new file mode 100644 index 00000000..825aadb7 --- /dev/null +++ b/src/histogram/histograms.rs @@ -0,0 +1,165 @@ +use ndarray::prelude::*; +use ndarray::Data; +use super::grid::Grid; +use super::errors::BinNotFound; + +/// Histogram data structure. +pub struct Histogram { + counts: ArrayD, + grid: Grid, +} + +impl Histogram { + /// Returns a new instance of Histogram given a [`Grid`]. + /// + /// [`Grid`]: struct.Grid.html + pub fn new(grid: Grid) -> Self { + let counts = ArrayD::zeros(grid.shape()); + Histogram { counts, grid } + } + + /// Adds a single observation to the histogram. + /// + /// **Panics** if dimensions do not match: `self.ndim() != observation.len()`. + /// + /// # Example: + /// ``` + /// extern crate ndarray_stats; + /// extern crate ndarray; + /// extern crate noisy_float; + /// use ndarray::array; + /// use ndarray_stats::histogram::{Edges, Bins, Histogram, Grid}; + /// use noisy_float::types::n64; + /// + /// # fn main() -> Result<(), Box> { + /// let edges = Edges::from(vec![n64(-1.), n64(0.), n64(1.)]); + /// let bins = Bins::new(edges); + /// let square_grid = Grid::from(vec![bins.clone(), bins.clone()]); + /// let mut histogram = Histogram::new(square_grid); + /// + /// let observation = array![n64(0.5), n64(0.6)]; + /// + /// histogram.add_observation(&observation)?; + /// + /// let histogram_matrix = histogram.counts(); + /// let expected = array![ + /// [0, 0], + /// [0, 1], + /// ]; + /// assert_eq!(histogram_matrix, expected.into_dyn()); + /// # Ok(()) + /// # } + /// ``` + pub fn add_observation(&mut self, observation: &ArrayBase) -> Result<(), BinNotFound> + where + S: Data, + { + match self.grid.index_of(observation) { + Some(bin_index) => { + self.counts[&*bin_index] += 1; + Ok(()) + }, + None => Err(BinNotFound) + } + } + + /// Returns the number of dimensions of the space the histogram is covering. + pub fn ndim(&self) -> usize { + debug_assert_eq!(self.counts.ndim(), self.grid.ndim()); + self.counts.ndim() + } + + /// Borrows a view on the histogram counts matrix. + pub fn counts(&self) -> ArrayViewD { + self.counts.view() + } + + /// Borrows an immutable reference to the histogram grid. + pub fn grid(&self) -> &Grid { + &self.grid + } +} + +/// Extension trait for `ArrayBase` providing methods to compute histograms. +pub trait HistogramExt + where + S: Data, +{ + /// Returns the [histogram](https://en.wikipedia.org/wiki/Histogram) + /// for a 2-dimensional array of points `M`. + /// + /// Let `(n, d)` be the shape of `M`: + /// - `n` is the number of points; + /// - `d` is the number of dimensions of the space those points belong to. + /// It follows that every column in `M` is a `d`-dimensional point. + /// + /// For example: a (3, 4) matrix `M` is a collection of 3 points in a + /// 4-dimensional space. + /// + /// Important: points outside the grid are ignored! + /// + /// **Panics** if `d` is different from `grid.ndim()`. + /// + /// # Example: + /// + /// ``` + /// extern crate ndarray_stats; + /// #[macro_use(array)] + /// extern crate ndarray; + /// extern crate noisy_float; + /// use ndarray_stats::{ + /// HistogramExt, + /// histogram::{ + /// Histogram, Grid, GridBuilder, + /// Edges, Bins, + /// strategies::Sqrt}, + /// }; + /// use noisy_float::types::{N64, n64}; + /// + /// # fn main() { + /// let observations = array![ + /// [n64(1.), n64(0.5)], + /// [n64(-0.5), n64(1.)], + /// [n64(-1.), n64(-0.5)], + /// [n64(0.5), n64(-1.)] + /// ]; + /// let grid = GridBuilder::>::from_array(&observations).build(); + /// let expected_grid = Grid::from( + /// vec![ + /// Bins::new(Edges::from(vec![n64(-1.), n64(0.), n64(1.), n64(2.)])), + /// Bins::new(Edges::from(vec![n64(-1.), n64(0.), n64(1.), n64(2.)])), + /// ] + /// ); + /// assert_eq!(grid, expected_grid); + /// + /// let histogram = observations.histogram(grid); + /// + /// let histogram_matrix = histogram.counts(); + /// // Bins are left inclusive, right exclusive! + /// let expected = array![ + /// [1, 0, 1], + /// [1, 0, 0], + /// [0, 1, 0], + /// ]; + /// assert_eq!(histogram_matrix, expected.into_dyn()); + /// # } + /// ``` + fn histogram(&self, grid: Grid) -> Histogram + where + A: Ord; +} + +impl HistogramExt for ArrayBase + where + S: Data, + A: Ord, +{ + fn histogram(&self, grid: Grid) -> Histogram + { + let mut histogram = Histogram::new(grid); + for point in self.axis_iter(Axis(0)) { + let _ = histogram.add_observation(&point); + } + histogram + } +} diff --git a/src/histogram/mod.rs b/src/histogram/mod.rs new file mode 100644 index 00000000..9176aee1 --- /dev/null +++ b/src/histogram/mod.rs @@ -0,0 +1,10 @@ +//! Histogram functionalities. +pub use self::histograms::{Histogram, HistogramExt}; +pub use self::bins::{Edges, Bins}; +pub use self::grid::{Grid, GridBuilder}; + +mod histograms; +mod bins; +pub mod strategies; +mod grid; +pub mod errors; diff --git a/src/histogram/strategies.rs b/src/histogram/strategies.rs new file mode 100644 index 00000000..d25eb2c3 --- /dev/null +++ b/src/histogram/strategies.rs @@ -0,0 +1,522 @@ +//! Strategies to build [`Bins`]s and [`Grid`]s (using [`GridBuilder`]) inferring +//! optimal parameters directly from data. +//! +//! The docs for each strategy have been taken almost verbatim from [`NumPy`]. +//! +//! Each strategy specifies how to compute the optimal number of [`Bins`] or +//! the optimal bin width. +//! For those strategies that prescribe the optimal number +//! of [`Bins`] we then compute the optimal bin width with +//! +//! `bin_width = (max - min)/n` +//! +//! All our bins are left-inclusive and right-exclusive: we make sure to add an extra bin +//! if it is necessary to include the maximum value of the array that has been passed as argument +//! to the `from_array` method. +//! +//! [`Bins`]: ../struct.Bins.html +//! [`Grid`]: ../struct.Grid.html +//! [`GridBuilder`]: ../struct.GridBuilder.html +//! [`NumPy`]: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges +use ndarray::prelude::*; +use ndarray::Data; +use num_traits::{FromPrimitive, NumOps, Zero}; +use super::super::{QuantileExt, QuantileExt1d}; +use super::super::interpolate::Nearest; +use super::{Edges, Bins}; + + +/// A trait implemented by all strategies to build [`Bins`] +/// with parameters inferred from observations. +/// +/// A `BinsBuildingStrategy` is required by [`GridBuilder`] +/// to know how to build a [`Grid`]'s projections on the +/// coordinate axes. +/// +/// [`Bins`]: ../struct.Bins.html +/// [`Grid`]: ../struct.Grid.html +/// [`GridBuilder`]: ../struct.GridBuilder.html +pub trait BinsBuildingStrategy +{ + type Elem: Ord; + /// Given some observations in a 1-dimensional array it returns a `BinsBuildingStrategy` + /// that has learned the required parameter to build a collection of [`Bins`]. + /// + /// [`Bins`]: ../struct.Bins.html + fn from_array(array: &ArrayBase) -> Self + where + S: Data; + + /// Returns a [`Bins`] instance, built accordingly to the parameters + /// inferred from observations in [`from_array`]. + /// + /// [`Bins`]: ../struct.Bins.html + /// [`from_array`]: #method.from_array.html + fn build(&self) -> Bins; + + /// Returns the optimal number of bins, according to the parameters + /// inferred from observations in [`from_array`]. + /// + /// [`from_array`]: #method.from_array.html + fn n_bins(&self) -> usize; +} + +struct EquiSpaced { + bin_width: T, + min: T, + max: T, +} + +/// Square root (of data size) strategy, used by Excel and other programs +/// for its speed and simplicity. +/// +/// Let `n` be the number of observations. Then +/// +/// `n_bins` = `sqrt(n)` +pub struct Sqrt { + builder: EquiSpaced, +} + +/// A strategy that does not take variability into account, only data size. Commonly +/// overestimates number of bins required. +/// +/// Let `n` be the number of observations and `n_bins` the number of bins. +/// +/// `n_bins` = 2`n`1/3 +/// +/// `n_bins` is only proportional to cube root of `n`. It tends to overestimate +/// the `n_bins` and it does not take into account data variability. +pub struct Rice { + builder: EquiSpaced, +} + +/// R’s default strategy, only accounts for data size. Only optimal for gaussian data and +/// underestimates number of bins for large non-gaussian datasets. +/// +/// Let `n` be the number of observations. +/// The number of bins is 1 plus the base 2 log of `n`. This estimator assumes normality of data and +/// is too conservative for larger, non-normal datasets. +/// +/// This is the default method in R’s hist method. +pub struct Sturges { + builder: EquiSpaced, +} + +/// Robust (resilient to outliers) strategy that takes into +/// account data variability and data size. +/// +/// Let `n` be the number of observations. +/// +/// `bin_width` = 2×`IQR`×`n`−1/3 +/// +/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to +/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for +/// large datasets. +/// +/// The [`IQR`] is very robust to outliers. +/// +/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range +pub struct FreedmanDiaconis { + builder: EquiSpaced, +} + +enum SturgesOrFD { + Sturges(Sturges), + FreedmanDiaconis(FreedmanDiaconis), +} + +/// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. +/// Provides good all around performance. +/// +/// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be chosen, +/// while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly +/// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for +/// small and large datasets respectively. +/// +/// [`Sturges`]: struct.Sturges.html +/// [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html +pub struct Auto { + builder: SturgesOrFD, +} + +impl EquiSpaced + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + /// **Panics** if `bin_width<=0`. + fn new(bin_width: T, min: T, max: T) -> Self + { + assert!(bin_width > T::zero()); + Self { bin_width, min, max } + } + + fn build(&self) -> Bins { + let n_bins = self.n_bins(); + let mut edges: Vec = vec![]; + for i in 0..(n_bins+1) { + let edge = self.min.clone() + T::from_usize(i).unwrap()*self.bin_width.clone(); + edges.push(edge); + } + Bins::new(Edges::from(edges)) + } + + fn n_bins(&self) -> usize { + let mut max_edge = self.min.clone(); + let mut n_bins = 0; + while max_edge <= self.max { + max_edge = max_edge + self.bin_width.clone(); + n_bins += 1; + } + return n_bins + } + + fn bin_width(&self) -> T { + self.bin_width.clone() + } +} + +impl BinsBuildingStrategy for Sqrt + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + type Elem = T; + + /// **Panics** if the array is constant or if `a.len()==0` and division by 0 panics for `T`. + fn from_array(a: &ArrayBase) -> Self + where + S: Data + { + let n_elems = a.len(); + let n_bins = (n_elems as f64).sqrt().round() as usize; + let min = a.min().clone(); + let max = a.max().clone(); + let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins); + let builder = EquiSpaced::new(bin_width, min, max); + Self { builder } + } + + fn build(&self) -> Bins { + self.builder.build() + } + + fn n_bins(&self) -> usize { + self.builder.n_bins() + } +} + +impl Sqrt + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + /// The bin width (or bin length) according to the fitted strategy. + pub fn bin_width(&self) -> T { + self.builder.bin_width() + } +} + +impl BinsBuildingStrategy for Rice + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + type Elem = T; + + /// **Panics** if the array is constant or if `a.len()==0` and division by 0 panics for `T`. + fn from_array(a: &ArrayBase) -> Self + where + S: Data + { + let n_elems = a.len(); + let n_bins = (2. * (n_elems as f64).powf(1./3.)).round() as usize; + let min = a.min().clone(); + let max = a.max().clone(); + let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins); + let builder = EquiSpaced::new(bin_width, min, max); + Self { builder } + } + + fn build(&self) -> Bins { + self.builder.build() + } + + fn n_bins(&self) -> usize { + self.builder.n_bins() + } +} + +impl Rice + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + /// The bin width (or bin length) according to the fitted strategy. + pub fn bin_width(&self) -> T { + self.builder.bin_width() + } +} + +impl BinsBuildingStrategy for Sturges + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + type Elem = T; + + /// **Panics** if the array is constant or if `a.len()==0` and division by 0 panics for `T`. + fn from_array(a: &ArrayBase) -> Self + where + S: Data + { + let n_elems = a.len(); + let n_bins = (n_elems as f64).log2().round() as usize + 1; + let min = a.min().clone(); + let max = a.max().clone(); + let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins); + let builder = EquiSpaced::new(bin_width, min, max); + Self { builder } + } + + fn build(&self) -> Bins { + self.builder.build() + } + + fn n_bins(&self) -> usize { + self.builder.n_bins() + } +} + +impl Sturges + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + /// The bin width (or bin length) according to the fitted strategy. + pub fn bin_width(&self) -> T { + self.builder.bin_width() + } +} + +impl BinsBuildingStrategy for FreedmanDiaconis + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + type Elem = T; + + /// **Panics** if `IQR==0` or if `a.len()==0` and division by 0 panics for `T`. + fn from_array(a: &ArrayBase) -> Self + where + S: Data + { + let n_points = a.len(); + + let mut a_copy = a.to_owned(); + let first_quartile = a_copy.quantile_mut::(0.25); + let third_quartile = a_copy.quantile_mut::(0.75); + let iqr = third_quartile - first_quartile; + + let bin_width = FreedmanDiaconis::compute_bin_width(n_points, iqr); + let min = a_copy.min().clone(); + let max = a_copy.max().clone(); + let builder = EquiSpaced::new(bin_width, min, max); + Self { builder } + } + + fn build(&self) -> Bins { + self.builder.build() + } + + fn n_bins(&self) -> usize { + self.builder.n_bins() + } +} + +impl FreedmanDiaconis + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + fn compute_bin_width(n_bins: usize, iqr: T) -> T + { + let denominator = (n_bins as f64).powf(1. / 3.); + let bin_width = T::from_usize(2).unwrap() * iqr / T::from_f64(denominator).unwrap(); + bin_width + } + + /// The bin width (or bin length) according to the fitted strategy. + pub fn bin_width(&self) -> T { + self.builder.bin_width() + } +} + +impl BinsBuildingStrategy for Auto + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + type Elem = T; + + /// **Panics** if `IQR==0`, the array is constant or if + /// `a.len()==0` and division by 0 panics for `T`. + fn from_array(a: &ArrayBase) -> Self + where + S: Data + { + let fd_builder = FreedmanDiaconis::from_array(&a); + let sturges_builder = Sturges::from_array(&a); + let builder = { + if fd_builder.bin_width() > sturges_builder.bin_width() { + SturgesOrFD::Sturges(sturges_builder) + } else { + SturgesOrFD::FreedmanDiaconis(fd_builder) + } + }; + Self { builder } + } + + fn build(&self) -> Bins { + // Ugly + match &self.builder { + SturgesOrFD::FreedmanDiaconis(b) => b.build(), + SturgesOrFD::Sturges(b) => b.build(), + } + } + + fn n_bins(&self) -> usize { + // Ugly + match &self.builder { + SturgesOrFD::FreedmanDiaconis(b) => b.n_bins(), + SturgesOrFD::Sturges(b) => b.n_bins(), + } + } +} + +impl Auto + where + T: Ord + Clone + FromPrimitive + NumOps + Zero +{ + /// The bin width (or bin length) according to the fitted strategy. + pub fn bin_width(&self) -> T { + // Ugly + match &self.builder { + SturgesOrFD::FreedmanDiaconis(b) => b.bin_width(), + SturgesOrFD::Sturges(b) => b.bin_width(), + } + } +} + +/// Given a range (max, min) and the number of bins, it returns +/// the associated bin_width: +/// +/// `bin_width = (max - min)/n` +/// +/// **Panics** if division by 0 panics for `T`. +fn compute_bin_width(min: T, max: T, n_bins: usize) -> T +where + T: Ord + Clone + FromPrimitive + NumOps + Zero, +{ + let range = max.clone() - min.clone(); + let bin_width = range / T::from_usize(n_bins).unwrap(); + bin_width +} + +#[cfg(test)] +mod equispaced_tests { + use super::*; + + #[should_panic] + #[test] + fn bin_width_has_to_be_positive() { + EquiSpaced::new(0, 0, 200); + } +} + +#[cfg(test)] +mod sqrt_tests { + use super::*; + + #[should_panic] + #[test] + fn constant_array_are_bad() { + Sqrt::from_array(&array![1, 1, 1, 1, 1, 1, 1]); + } + + #[should_panic] + #[test] + fn empty_arrays_cause_panic() { + let _: Sqrt = Sqrt::from_array(&array![]); + } +} + +#[cfg(test)] +mod rice_tests { + use super::*; + + #[should_panic] + #[test] + fn constant_array_are_bad() { + Rice::from_array(&array![1, 1, 1, 1, 1, 1, 1]); + } + + #[should_panic] + #[test] + fn empty_arrays_cause_panic() { + let _: Rice = Rice::from_array(&array![]); + } +} + +#[cfg(test)] +mod sturges_tests { + use super::*; + + #[should_panic] + #[test] + fn constant_array_are_bad() { + Sturges::from_array(&array![1, 1, 1, 1, 1, 1, 1]); + } + + #[should_panic] + #[test] + fn empty_arrays_cause_panic() { + let _: Sturges = Sturges::from_array(&array![]); + } +} + +#[cfg(test)] +mod fd_tests { + use super::*; + + #[should_panic] + #[test] + fn constant_array_are_bad() { + FreedmanDiaconis::from_array(&array![1, 1, 1, 1, 1, 1, 1]); + } + + #[should_panic] + #[test] + fn zero_iqr_causes_panic() { + FreedmanDiaconis::from_array(&array![-20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20]); + } + + #[should_panic] + #[test] + fn empty_arrays_cause_panic() { + let _: FreedmanDiaconis = FreedmanDiaconis::from_array(&array![]); + } +} + +#[cfg(test)] +mod auto_tests { + use super::*; + + #[should_panic] + #[test] + fn constant_array_are_bad() { + Auto::from_array(&array![1, 1, 1, 1, 1, 1, 1]); + } + + #[should_panic] + #[test] + fn zero_iqr_causes_panic() { + Auto::from_array(&array![-20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20]); + } + + #[should_panic] + #[test] + fn empty_arrays_cause_panic() { + let _: Auto = Auto::from_array(&array![]); + } +} diff --git a/src/lib.rs b/src/lib.rs index cf8e56d8..b2543b71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ extern crate ndarray; extern crate noisy_float; extern crate num_traits; extern crate rand; +extern crate itertools; #[cfg(test)] extern crate ndarray_rand; @@ -12,11 +13,13 @@ extern crate ndarray_rand; extern crate quickcheck; pub use maybe_nan::{MaybeNan, MaybeNanExt}; -pub use quantile::{interpolate, QuantileExt}; +pub use quantile::{interpolate, QuantileExt, QuantileExt1d}; pub use sort::Sort1dExt; pub use correlation::CorrelationExt; +pub use histogram::HistogramExt; mod maybe_nan; mod quantile; mod sort; -mod correlation; \ No newline at end of file +mod correlation; +pub mod histogram; \ No newline at end of file diff --git a/src/quantile.rs b/src/quantile.rs index e61e008c..3e179539 100644 --- a/src/quantile.rs +++ b/src/quantile.rs @@ -169,7 +169,7 @@ pub mod interpolate { } } -/// Quantile methods. +/// Quantile methods for `ArrayBase`. pub trait QuantileExt where S: Data, @@ -410,3 +410,57 @@ where }) } } + +/// Quantile methods for 1-dimensional arrays. +pub trait QuantileExt1d + where + S: Data, +{ + /// Return the qth quantile of the data. + /// + /// `q` needs to be a float between 0 and 1, bounds included. + /// The qth quantile for a 1-dimensional array of length `N` is defined + /// as the element that would be indexed as `(N-1)q` if the array were to be sorted + /// in increasing order. + /// If `(N-1)q` is not an integer the desired quantile lies between + /// two data points: we return the lower, nearest, higher or interpolated + /// value depending on the type `Interpolate` bound `I`. + /// + /// Some examples: + /// - `q=0.` returns the minimum; + /// - `q=0.5` returns the median; + /// - `q=1.` returns the maximum. + /// (`q=0` and `q=1` are considered improper quantiles) + /// + /// The array is shuffled **in place** in order to produce the required quantile + /// without allocating a copy. + /// No assumptions should be made on the ordering of the array elements + /// after this computation. + /// + /// Complexity ([quickselect](https://en.wikipedia.org/wiki/Quickselect)): + /// - average case: O(`m`); + /// - worst case: O(`m`^2); + /// where `m` is the number of elements in the array. + /// + /// **Panics** if `q` is not between `0.` and `1.` (inclusive). + fn quantile_mut(&mut self, q: f64) -> A + where + A: Ord + Clone, + S: DataMut, + I: Interpolate; +} + +impl QuantileExt1d for ArrayBase + where + S: Data, +{ + fn quantile_mut(&mut self, q: f64) -> A + where + A: Ord + Clone, + S: DataMut, + I: Interpolate, + { + self.quantile_axis_mut::(Axis(0), q).into_scalar() + } +} +