-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #133 from lbluque/master
Simple structure selection functions
- Loading branch information
Showing
3 changed files
with
160 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
"""Tools for training structure selection.""" | ||
|
||
from warnings import warn | ||
import random | ||
import numpy as np | ||
from scipy.linalg import lu | ||
from scipy.stats import multivariate_normal, multinomial | ||
|
||
__author__ = "Luis Barroso-Luque" | ||
|
||
|
||
def full_row_rank_select(feature_matrix, tol=1E-15, nrows=None): | ||
"""Choose a (maximally) full rank subset of rows in a feature matrix. | ||
This method is for underdetermined systems. | ||
columns/features > rows/structures | ||
Args: | ||
feature_matrix (ndarray): | ||
feature matrix to select rows/structure from. | ||
tol (float): optional | ||
tolerance to use to determine the pivots in upper triangular matrix | ||
of the LU decomposition. | ||
nrows (int): optional | ||
Number of rows to include. If None given, will include the maximum | ||
possible number of rows. | ||
Returns: | ||
list of int: list with indices of rows that form a full rank system. | ||
""" | ||
nrows = nrows if nrows is not None else feature_matrix.shape[0] | ||
P, L, U = lu(feature_matrix.T) | ||
sample_ids = np.unique((abs(U) > tol).argmax(axis=1))[:nrows] | ||
if len(sample_ids) > np.linalg.matrix_rank(feature_matrix[:nrows]): | ||
warn("More samples than the matrix rank where selected.\n" | ||
"The selection will not actually be full rank.\n" | ||
"Decrease tolerance to ensure full rank indices are selected.\n") | ||
return list(sample_ids) | ||
|
||
|
||
def gaussian_select(feature_matrix, num_samples): | ||
"""V. Vanilla structure selection to increase incoherence. | ||
Sequentially pick samples with feature vector that most closely aligns | ||
with a sampled random gaussian vector on the unit sphere. | ||
Args: | ||
feature_matrix (ndarray): | ||
feature matrix to select rows/structure from. | ||
num_samples (int): | ||
number of samples/rows/structures to select. | ||
Returns: | ||
list of int: list with indices of rows that align with Gaussian samples | ||
""" | ||
M = feature_matrix[:, 1:].copy() # exclude first column (empty cluster) | ||
m, n = M.shape | ||
M /= np.linalg.norm(M, axis=1).reshape(m, 1) | ||
gauss = multivariate_normal(mean=np.zeros(n)).rvs(size=num_samples) | ||
gauss /= np.linalg.norm(gauss, axis=1).reshape(num_samples, 1) | ||
sample_ids = map(lambda v: np.argmin(np.linalg.norm(M - v, axis=1)), gauss) | ||
return list(sample_ids) | ||
|
||
|
||
def composition_select(composition_vector, composition, cell_sizes, | ||
num_samples): | ||
"""Structure selection based on composition multinomial probability. | ||
Note: this function is needs quite a bit of tweaking to get nice samples. | ||
Args: | ||
composition_vector (ndarray): | ||
N (samples) by n (components) with the composition of the samples | ||
to select from. | ||
composition (ndarray): | ||
array for the center composition to sample around. | ||
cell_sizes (int or Sequence): | ||
size of unit cell sizes or size of supercells used to set the | ||
number of variables in the multinomial distribution. | ||
num_samples (int): | ||
number of samples to return. Note that if the number is too high | ||
compared to the total number of samples (or coverage of the space) | ||
it may take very long to return if at all, and the samples will not | ||
be very representative of the multinomial distributions. | ||
Returns: | ||
list of int: list with indices of the composition vector corresponding | ||
to selected samples. | ||
""" | ||
unique_compositions = np.unique(composition_vector, axis=1) | ||
# Sample structures biased by composition | ||
if isinstance(cell_sizes, int): | ||
cell_sizes = np.array(len(composition_vector) * [cell_sizes, ]) | ||
else: | ||
cell_sizes = np.array(cell_sizes) | ||
|
||
dists = {size: multinomial(size, composition) | ||
for size in np.unique(cell_sizes)} | ||
max_probs = {size: dist.pmf(size * unique_compositions).max() | ||
for size, dist in dists.items()} | ||
sample_ids = [ | ||
i for i, (size, comp) in enumerate(zip(cell_sizes, composition_vector)) | ||
if dists[size].pmf(size*comp) >= max_probs[size]*random.random()] | ||
while len(sample_ids) <= num_samples: | ||
sample_ids += [ | ||
i for i, (size, comp) in enumerate( | ||
zip(cell_sizes, composition_vector)) | ||
if dists[size].pmf(size*comp) >= max_probs[size]*random.random() | ||
and i not in sample_ids] | ||
return sample_ids[:num_samples] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import numpy as np | ||
import numpy.testing as npt | ||
from smol.cofe.wrangling.select import full_row_rank_select, gaussian_select, \ | ||
composition_select | ||
|
||
|
||
def test_full_row_rank_select(): | ||
for shape in ((100, 100), (100, 200), (100, 500)): | ||
matrix = 10 * np.random.random(shape) | ||
# test for set sizes | ||
for n in range(5, shape[0]//2): | ||
inds = full_row_rank_select(feature_matrix=matrix, nrows=n) | ||
assert max(inds) <= shape[0] | ||
assert len(inds) == n | ||
|
||
# test for max possible samples | ||
inds = full_row_rank_select(feature_matrix=matrix) | ||
assert max(inds) <= shape[0] | ||
# add 1 for numerical slack | ||
assert len(inds) + 1 >= np.linalg.matrix_rank(matrix) | ||
|
||
|
||
def test_gaussian_select(): | ||
# not an amazing test, but something is something | ||
matrix = 10 * np.random.random((3000, 1000)) - 2 | ||
inds = gaussian_select(matrix, 300) | ||
std = matrix[inds].std(axis=0) | ||
assert np.all(abs(matrix[inds].mean(axis=0)) <= 1.5 * std) | ||
assert matrix[inds].mean() <= matrix.mean() | ||
|
||
|
||
def test_composition_select(): | ||
n = 5000 | ||
cell_sizes = np.random.choice((5, 10, 15, 20), size=n) | ||
species = [np.array([np.random.randint(0, 3) for _ in range(size)]) | ||
for size in cell_sizes] | ||
concentrations = np.array( | ||
[[sum(s == 0)/len(s), sum(s == 1)/len(s), sum(s == 2)/len(s)] | ||
for s in species]) | ||
inds = composition_select(concentrations, [0.2, 0.2, 0.6], cell_sizes, 200) | ||
|
||
# initial concentrations should be .3, .3, .3 | ||
npt.assert_array_almost_equal( | ||
concentrations.mean(axis=0), [1/3.0, 1/3.0, 1/3.0], decimal=2) | ||
sample_avg = concentrations[inds].mean(axis=0) | ||
# very loose criteria here | ||
assert sample_avg[0] < 1/3 | ||
assert sample_avg[1] < 1/3 | ||
assert sample_avg[2] > 1 / 3 |