Merge pull request #133 from lbluque/master
Simple structure selection functions
lbluque authored Aug 13, 2021
2 parents 78e3c2b + b56958c commit c2125b2
Showing 3 changed files with 160 additions and 1 deletion.
4 changes: 3 additions & 1 deletion smol/cofe/wrangling/__init__.py
@@ -5,6 +5,8 @@

 from .tools import unique_corr_vector_indices, max_ewald_energy_indices, \
     weights_energy_above_composition, weights_energy_above_hull
+from .select import full_row_rank_select, gaussian_select, composition_select
 
 __all__ = ["unique_corr_vector_indices", "max_ewald_energy_indices",
-           "weights_energy_above_composition", "weights_energy_above_hull"]
+           "weights_energy_above_composition", "weights_energy_above_hull",
+           "full_row_rank_select", "gaussian_select", "composition_select"]
108 changes: 108 additions & 0 deletions smol/cofe/wrangling/select.py
@@ -0,0 +1,108 @@
"""Tools for training structure selection."""

from warnings import warn
import random
import numpy as np
from scipy.linalg import lu
from scipy.stats import multivariate_normal, multinomial

__author__ = "Luis Barroso-Luque"


def full_row_rank_select(feature_matrix, tol=1E-15, nrows=None):
    """Choose a (maximally) full rank subset of rows in a feature matrix.

    This method is meant for underdetermined systems, i.e. where there are
    more columns/features than rows/structures.

    Args:
        feature_matrix (ndarray):
            feature matrix to select rows/structures from.
        tol (float): optional
            tolerance used to determine the pivots in the upper triangular
            matrix of the LU decomposition.
        nrows (int): optional
            number of rows to include. If None is given, the maximum possible
            number of rows is included.

    Returns:
        list of int: list with indices of rows that form a full rank system.
    """
    nrows = nrows if nrows is not None else feature_matrix.shape[0]
    P, L, U = lu(feature_matrix.T)
    # pivot columns of U correspond to linearly independent rows/structures
    sample_ids = np.unique((abs(U) > tol).argmax(axis=1))[:nrows]
    if len(sample_ids) > np.linalg.matrix_rank(feature_matrix[:nrows]):
        warn("More samples than the matrix rank were selected.\n"
             "The selection will not actually be full rank.\n"
             "Decrease the tolerance to ensure full rank indices are "
             "selected.\n")
    return list(sample_ids)


def gaussian_select(feature_matrix, num_samples):
    """Very vanilla structure selection to increase incoherence.

    Sequentially pick the sample whose feature vector most closely aligns
    with a random Gaussian vector drawn on the unit sphere.

    Args:
        feature_matrix (ndarray):
            feature matrix to select rows/structures from.
        num_samples (int):
            number of samples/rows/structures to select.

    Returns:
        list of int: list with indices of rows that best align with the
        Gaussian samples.
    """
    M = feature_matrix[:, 1:].copy()  # exclude first column (empty cluster)
    m, n = M.shape
    M /= np.linalg.norm(M, axis=1).reshape(m, 1)  # normalize rows
    gauss = multivariate_normal(mean=np.zeros(n)).rvs(size=num_samples)
    gauss /= np.linalg.norm(gauss, axis=1).reshape(num_samples, 1)
    sample_ids = map(lambda v: np.argmin(np.linalg.norm(M - v, axis=1)), gauss)
    return list(sample_ids)


def composition_select(composition_vector, composition, cell_sizes,
                       num_samples):
    """Structure selection based on composition multinomial probability.

    Note: this function needs quite a bit of tweaking to get nice samples.

    Args:
        composition_vector (ndarray):
            N (samples) by n (components) array with the compositions of the
            samples to select from.
        composition (ndarray):
            array with the center composition to sample around.
        cell_sizes (int or Sequence):
            single unit cell size or sequence of supercell sizes used to set
            the number of trials in the multinomial distribution.
        num_samples (int):
            number of samples to return. Note that if this number is too high
            compared to the total number of samples (or the coverage of the
            composition space), it may take very long to return, if it returns
            at all, and the samples will not be very representative of the
            multinomial distribution.

    Returns:
        list of int: list with indices of the composition vector corresponding
        to the selected samples.
    """
    # unique composition rows, one per distinct sample composition
    unique_compositions = np.unique(composition_vector, axis=0)
    # Sample structures biased by composition
    if isinstance(cell_sizes, int):
        cell_sizes = np.array(len(composition_vector) * [cell_sizes, ])
    else:
        cell_sizes = np.array(cell_sizes)

    dists = {size: multinomial(size, composition)
             for size in np.unique(cell_sizes)}
    max_probs = {size: dist.pmf(size * unique_compositions).max()
                 for size, dist in dists.items()}
    # rejection sampling: keep a sample with probability pmf / max_prob
    sample_ids = [
        i for i, (size, comp) in enumerate(zip(cell_sizes, composition_vector))
        if dists[size].pmf(size * comp) >= max_probs[size] * random.random()]
    while len(sample_ids) <= num_samples:
        sample_ids += [
            i for i, (size, comp) in enumerate(
                zip(cell_sizes, composition_vector))
            if dists[size].pmf(size * comp) >= max_probs[size] * random.random()
            and i not in sample_ids]
    return sample_ids[:num_samples]
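
The three functions above operate only on plain arrays, so a short end-to-end sketch may help show how they are meant to be combined. Everything below is illustrative and not part of the commit: the random feature matrix, compositions, cell sizes, and target composition are synthetic stand-ins for the correlation vectors and sample compositions that would normally come from a structure wrangler.

import numpy as np
from smol.cofe.wrangling import (
    full_row_rank_select, gaussian_select, composition_select)

rng = np.random.default_rng(0)

# synthetic feature matrix: more features than structures (underdetermined)
feature_matrix = rng.random((200, 400))
feature_matrix[:, 0] = 1.0  # constant "empty cluster" column

# 1) rows that form a (maximally) full rank subsystem
full_rank_ids = full_row_rank_select(feature_matrix, nrows=50)

# 2) rows whose normalized feature vectors align with random unit Gaussians
gaussian_ids = gaussian_select(feature_matrix, num_samples=50)

# 3) rows accepted with probability biased toward a target composition
cell_sizes = rng.choice([8, 16], size=200)
counts = np.array([rng.multinomial(size, [0.3, 0.3, 0.4])
                   for size in cell_sizes])
compositions = counts / cell_sizes[:, None]  # each row sums to 1
composition_ids = composition_select(
    compositions, composition=[0.25, 0.25, 0.5],
    cell_sizes=cell_sizes, num_samples=50)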
49 changes: 49 additions & 0 deletions tests/test_cofe/test_select.py
@@ -0,0 +1,49 @@
import numpy as np
import numpy.testing as npt
from smol.cofe.wrangling.select import full_row_rank_select, gaussian_select, \
composition_select


def test_full_row_rank_select():
    for shape in ((100, 100), (100, 200), (100, 500)):
        matrix = 10 * np.random.random(shape)
        # test for set numbers of rows
        for n in range(5, shape[0] // 2):
            inds = full_row_rank_select(feature_matrix=matrix, nrows=n)
            assert max(inds) < shape[0]
            assert len(inds) == n

        # test for the maximum possible number of samples
        inds = full_row_rank_select(feature_matrix=matrix)
        assert max(inds) < shape[0]
        # add 1 for numerical slack
        assert len(inds) + 1 >= np.linalg.matrix_rank(matrix)


def test_gaussian_select():
    # not an amazing test, but something is better than nothing
    matrix = 10 * np.random.random((3000, 1000)) - 2
    inds = gaussian_select(matrix, 300)
    std = matrix[inds].std(axis=0)
    assert np.all(abs(matrix[inds].mean(axis=0)) <= 1.5 * std)
    assert matrix[inds].mean() <= matrix.mean()


def test_composition_select():
    n = 5000
    cell_sizes = np.random.choice((5, 10, 15, 20), size=n)
    species = [np.array([np.random.randint(0, 3) for _ in range(size)])
               for size in cell_sizes]
    concentrations = np.array(
        [[sum(s == 0)/len(s), sum(s == 1)/len(s), sum(s == 2)/len(s)]
         for s in species])
    inds = composition_select(concentrations, [0.2, 0.2, 0.6], cell_sizes, 200)

    # initial concentrations should each be ~1/3
    npt.assert_array_almost_equal(
        concentrations.mean(axis=0), [1/3.0, 1/3.0, 1/3.0], decimal=2)
    sample_avg = concentrations[inds].mean(axis=0)
    # very loose criteria: the selection should shift toward [0.2, 0.2, 0.6]
    assert sample_avg[0] < 1/3
    assert sample_avg[1] < 1/3
    assert sample_avg[2] > 1/3
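
Assuming the repository's usual pytest setup, the new tests can presumably be run on their own with something like:

pytest tests/test_cofe/test_select.py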
