Merge pull request #133 from lbluque/master
Simple structure selection functions
lbluque authored Aug 13, 2021
2 parents 78e3c2b + b56958c commit c2125b2
Showing 3 changed files with 160 additions and 1 deletion.
4 changes: 3 additions & 1 deletion smol/cofe/wrangling/__init__.py
@@ -5,6 +5,8 @@

 from .tools import unique_corr_vector_indices, max_ewald_energy_indices, \
     weights_energy_above_composition, weights_energy_above_hull
+from .select import full_row_rank_select, gaussian_select, composition_select
 
 __all__ = ["unique_corr_vector_indices", "max_ewald_energy_indices",
-           "weights_energy_above_composition", "weights_energy_above_hull"]
+           "weights_energy_above_composition", "weights_energy_above_hull",
+           "full_row_rank_select", "gaussian_select", "composition_select"]
108 changes: 108 additions & 0 deletions smol/cofe/wrangling/select.py
@@ -0,0 +1,108 @@
"""Tools for training structure selection."""

from warnings import warn
import random
import numpy as np
from scipy.linalg import lu
from scipy.stats import multivariate_normal, multinomial

__author__ = "Luis Barroso-Luque"


def full_row_rank_select(feature_matrix, tol=1E-15, nrows=None):
    """Choose a (maximally) full rank subset of rows in a feature matrix.

    This method is meant for underdetermined systems, i.e. where there are
    more columns/features than rows/structures.

    Args:
        feature_matrix (ndarray):
            feature matrix to select rows/structures from.
        tol (float): optional
            tolerance used to determine the pivots in the upper triangular
            matrix of the LU decomposition.
        nrows (int): optional
            number of rows to include. If None is given, the maximum possible
            number of rows is included.

    Returns:
        list of int: list with indices of rows that form a full rank system.
    """
    nrows = nrows if nrows is not None else feature_matrix.shape[0]
    P, L, U = lu(feature_matrix.T)
    # pivot columns of U correspond to linearly independent rows/structures
    sample_ids = np.unique((abs(U) > tol).argmax(axis=1))[:nrows]
    if len(sample_ids) > np.linalg.matrix_rank(feature_matrix[:nrows]):
        warn("More samples than the matrix rank were selected.\n"
             "The selection will not actually be full rank.\n"
             "Decrease the tolerance to ensure full rank indices are "
             "selected.\n")
    return list(sample_ids)


def gaussian_select(feature_matrix, num_samples):
    """Very vanilla structure selection to increase incoherence.

    Sequentially pick the sample whose feature vector most closely aligns
    with a random Gaussian vector drawn on the unit sphere.

    Args:
        feature_matrix (ndarray):
            feature matrix to select rows/structures from.
        num_samples (int):
            number of samples/rows/structures to select.

    Returns:
        list of int: list with indices of rows that best align with the
        Gaussian samples.
    """
    M = feature_matrix[:, 1:].copy()  # exclude first column (empty cluster)
    m, n = M.shape
    M /= np.linalg.norm(M, axis=1).reshape(m, 1)  # normalize rows
    gauss = multivariate_normal(mean=np.zeros(n)).rvs(size=num_samples)
    gauss /= np.linalg.norm(gauss, axis=1).reshape(num_samples, 1)
    sample_ids = map(lambda v: np.argmin(np.linalg.norm(M - v, axis=1)), gauss)
    return list(sample_ids)


def composition_select(composition_vector, composition, cell_sizes,
                       num_samples):
    """Structure selection based on composition multinomial probability.

    Note: this function needs quite a bit of tweaking to get nice samples.

    Args:
        composition_vector (ndarray):
            N (samples) by n (components) array with the compositions of the
            samples to select from.
        composition (ndarray):
            array with the center composition to sample around.
        cell_sizes (int or Sequence):
            single unit cell size or sequence of supercell sizes used to set
            the number of trials in the multinomial distribution.
        num_samples (int):
            number of samples to return. Note that if this number is too high
            compared to the total number of samples (or the coverage of the
            composition space), it may take very long to return, if it returns
            at all, and the samples will not be very representative of the
            multinomial distribution.

    Returns:
        list of int: list with indices of the composition vector corresponding
        to the selected samples.
    """
    # unique composition rows, one per distinct sample composition
    unique_compositions = np.unique(composition_vector, axis=0)
    # Sample structures biased by composition
    if isinstance(cell_sizes, int):
        cell_sizes = np.array(len(composition_vector) * [cell_sizes, ])
    else:
        cell_sizes = np.array(cell_sizes)

    dists = {size: multinomial(size, composition)
             for size in np.unique(cell_sizes)}
    max_probs = {size: dist.pmf(size * unique_compositions).max()
                 for size, dist in dists.items()}
    # rejection sampling: keep a sample with probability pmf / max_prob
    sample_ids = [
        i for i, (size, comp) in enumerate(zip(cell_sizes, composition_vector))
        if dists[size].pmf(size * comp) >= max_probs[size] * random.random()]
    while len(sample_ids) <= num_samples:
        sample_ids += [
            i for i, (size, comp) in enumerate(
                zip(cell_sizes, composition_vector))
            if dists[size].pmf(size * comp) >= max_probs[size] * random.random()
            and i not in sample_ids]
    return sample_ids[:num_samples]
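
The three functions above operate only on plain arrays, so a short end-to-end sketch may help show how they are meant to be combined. Everything below is illustrative and not part of the commit: the random feature matrix, compositions, cell sizes, and target composition are synthetic stand-ins for the correlation vectors and sample compositions that would normally come from a structure wrangler.

import numpy as np
from smol.cofe.wrangling import (
    full_row_rank_select, gaussian_select, composition_select)

rng = np.random.default_rng(0)

# synthetic feature matrix: more features than structures (underdetermined)
feature_matrix = rng.random((200, 400))
feature_matrix[:, 0] = 1.0  # constant "empty cluster" column

# 1) rows that form a (maximally) full rank subsystem
full_rank_ids = full_row_rank_select(feature_matrix, nrows=50)

# 2) rows whose normalized feature vectors align with random unit Gaussians
gaussian_ids = gaussian_select(feature_matrix, num_samples=50)

# 3) rows accepted with probability biased toward a target composition
cell_sizes = rng.choice([8, 16], size=200)
counts = np.array([rng.multinomial(size, [0.3, 0.3, 0.4])
                   for size in cell_sizes])
compositions = counts / cell_sizes[:, None]  # each row sums to 1
composition_ids = composition_select(
    compositions, composition=[0.25, 0.25, 0.5],
    cell_sizes=cell_sizes, num_samples=50)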
49 changes: 49 additions & 0 deletions tests/test_cofe/test_select.py
@@ -0,0 +1,49 @@
import numpy as np
import numpy.testing as npt
from smol.cofe.wrangling.select import full_row_rank_select, gaussian_select, \
composition_select


def test_full_row_rank_select():
    for shape in ((100, 100), (100, 200), (100, 500)):
        matrix = 10 * np.random.random(shape)
        # test for set numbers of rows
        for n in range(5, shape[0] // 2):
            inds = full_row_rank_select(feature_matrix=matrix, nrows=n)
            assert max(inds) < shape[0]
            assert len(inds) == n

        # test for the maximum possible number of samples
        inds = full_row_rank_select(feature_matrix=matrix)
        assert max(inds) < shape[0]
        # add 1 for numerical slack
        assert len(inds) + 1 >= np.linalg.matrix_rank(matrix)


def test_gaussian_select():
    # not an amazing test, but something is better than nothing
    matrix = 10 * np.random.random((3000, 1000)) - 2
    inds = gaussian_select(matrix, 300)
    std = matrix[inds].std(axis=0)
    assert np.all(abs(matrix[inds].mean(axis=0)) <= 1.5 * std)
    assert matrix[inds].mean() <= matrix.mean()


def test_composition_select():
    n = 5000
    cell_sizes = np.random.choice((5, 10, 15, 20), size=n)
    species = [np.array([np.random.randint(0, 3) for _ in range(size)])
               for size in cell_sizes]
    concentrations = np.array(
        [[sum(s == 0)/len(s), sum(s == 1)/len(s), sum(s == 2)/len(s)]
         for s in species])
    inds = composition_select(concentrations, [0.2, 0.2, 0.6], cell_sizes, 200)

    # initial concentrations should each be ~1/3
    npt.assert_array_almost_equal(
        concentrations.mean(axis=0), [1/3.0, 1/3.0, 1/3.0], decimal=2)
    sample_avg = concentrations[inds].mean(axis=0)
    # very loose criteria: the selection should shift toward [0.2, 0.2, 0.6]
    assert sample_avg[0] < 1/3
    assert sample_avg[1] < 1/3
    assert sample_avg[2] > 1/3
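
Assuming the repository's usual pytest setup, the new tests can presumably be run on their own with something like:

pytest tests/test_cofe/test_select.py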
