MAINT: hardcoded skbio metrics to temporarily work around NaN to 0 im…

…plementation (#63)
qiime2 · May 2, 2024 · 983125e · 983125e
1 parent 40510b5
commit 983125e
Show file tree

Hide file tree

Showing 4 changed files with 185 additions and 11 deletions.
diff --git a/q2_diversity_lib/alpha.py b/q2_diversity_lib/alpha.py
@@ -7,9 +7,16 @@
 # ----------------------------------------------------------------------------
 
 import pandas as pd
+import numpy as np
+import functools
+
 import skbio.diversity
+from skbio.diversity._util import _validate_counts_vector
+import skbio.diversity.alpha
+
+from scipy.special import gammaln
+
 import biom
-import numpy as np
 
 from q2_types.feature_table import BIOMV210Format
 from q2_types.sample_data import AlphaDiversityFormat
@@ -91,7 +98,10 @@ def transform_(v, i, m):
 
  results = []
  for v in table.iter_data(dense=True):
- results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
+ # using in-house metrics temporarily
+ # results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
+ v = np.reshape(v, (1, len(v)))
+ results.extend([_p_evenness(c)for c in v])
  results = pd.Series(results, index=table.ids(), name='pielou_evenness')
  return results
 
@@ -104,16 +114,107 @@ def shannon_entropy(table: biom.Table,
 
  results = []
  for v in table.iter_data(dense=True):
- results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
+ # using in-house metrics temporarily
+ # results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
+ v = np.reshape(v, (1, len(v)))
+ results.extend([_shannon(c)for c in v])
  results = pd.Series(results, index=table.ids(), name='shannon_entropy')
  return results
 
 
 @_validate_tables
 def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
  results = []
-
- for v in table.iter_data(dense=True):
- results.append(_skbio_alpha_diversity_from_1d(v.astype(int), metric))
+ method_map = {"berger_parker_d": _berger_parker,
+ "brillouin_d": _brillouin_d,
+ "simpson": _simpsons_dominance,
+ "esty_ci": _esty_ci,
+ "goods_coverage": _goods_coverage,
+ "margalef": _margalef,
+ "mcintosh_d": _mcintosh_d,
+ "strong": _strong}
+
+ if metric in method_map:
+ metric = functools.partial(method_map[metric])
+ for v in table.iter_data(dense=True):
+ v = np.reshape(v, (1, len(v)))
+ results.extend([metric(c)for c in v])
+ else:
+ for v in table.iter_data(dense=True):
+ results.append(_skbio_alpha_diversity_from_1d(v.astype(int),
+ metric))
  results = pd.Series(results, index=table.ids(), name=metric)
  return results
+
+
+# c&p methods from skbio
+def _berger_parker(counts):
+ counts = _validate_counts_vector(counts)
+ return counts.max() / counts.sum()
+
+
+def _brillouin_d(counts):
+ counts = _validate_counts_vector(counts)
+ nz = counts[counts.nonzero()]
+ n = nz.sum()
+ return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n
+
+
+def _simpsons_dominance(counts):
+ counts = _validate_counts_vector(counts)
+ return 1 - skbio.diversity.alpha.dominance(counts)
+
+
+def _esty_ci(counts):
+ counts = _validate_counts_vector(counts)
+
+ f1 = skbio.diversity.alpha.singles(counts)
+ f2 = skbio.diversity.alpha.doubles(counts)
+ n = counts.sum()
+ z = 1.959963985
+ W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)
+
+ return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W)
+
+
+def _goods_coverage(counts):
+ counts = _validate_counts_vector(counts)
+ f1 = skbio.diversity.alpha.singles(counts)
+ N = counts.sum()
+ return 1 - (f1 / N)
+
+
+def _margalef(counts):
+ counts = _validate_counts_vector(counts)
+ # replaced observed_otu call to sobs
+ return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum())
+
+
+def _mcintosh_d(counts):
+ counts = _validate_counts_vector(counts)
+ u = np.sqrt((counts * counts).sum())
+ n = counts.sum()
+ return (n - u) / (n - np.sqrt(n))
+
+
+def _strong(counts):
+ counts = _validate_counts_vector(counts)
+ n = counts.sum()
+ # replaced observed_otu call to sobs
+ s = skbio.diversity.alpha.sobs(counts)
+ i = np.arange(1, len(counts) + 1)
+ sorted_sum = np.sort(counts)[::-1].cumsum()
+ return (sorted_sum / n - (i / s)).max()
+
+
+def _p_evenness(counts):
+ counts = _validate_counts_vector(counts)
+ return _shannon(counts, base=np.e) / np.log(
+ skbio.diversity.alpha.sobs(counts=counts))
+
+
+def _shannon(counts, base=2):
+ counts = _validate_counts_vector(counts)
+ freqs = counts / counts.sum()
+ nonzero_freqs = freqs[freqs.nonzero()]
+ return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base)
diff --git a/q2_diversity_lib/beta.py b/q2_diversity_lib/beta.py
@@ -38,7 +38,7 @@
  'IMPL': {'braycurtis', 'jaccard'},
  'UNIMPL': {'cityblock', 'euclidean', 'seuclidean', 'sqeuclidean',
  'cosine', 'correlation', 'hamming', 'chebyshev', 'canberra',
- 'yule', 'matching', 'dice', 'kulsinski',
+ 'yule', 'matching', 'dice',
  'rogerstanimoto', 'russellrao', 'sokalmichener',
  'sokalsneath', 'minkowski', 'aitchison', 'canberra_adkins',
  'jensenshannon'}

diff --git a/q2_diversity_lib/examples.py b/q2_diversity_lib/examples.py
@@ -322,8 +322,8 @@ def beta_passthrough_n_jobs_example(use):
  result, = use.action(
  use.UsageAction(plugin_id='diversity_lib',
  action_id='beta_passthrough'),
- use.UsageInputs(table=ft, metric='kulsinski', n_jobs=1),
- use.UsageOutputNames(distance_matrix='kulsinski_dm')
+ use.UsageInputs(table=ft, metric='euclidean', n_jobs=1),
+ use.UsageOutputNames(distance_matrix='euclidean_dm')
  )
  result.assert_output_type('DistanceMatrix')
 

diff --git a/q2_diversity_lib/tests/test_alpha.py b/q2_diversity_lib/tests/test_alpha.py
@@ -9,6 +9,7 @@
 from subprocess import CalledProcessError
 
 import numpy as np
+import numpy.testing as npt
 import pandas as pd
 import pandas.testing as pdt
 import biom
@@ -17,7 +18,12 @@
 from qiime2 import Artifact
 
 from ..alpha import (pielou_evenness, observed_features,
- shannon_entropy, METRICS)
+ shannon_entropy, METRICS,
+ _berger_parker, _brillouin_d,
+ _simpsons_dominance, _esty_ci,
+ _goods_coverage, _margalef,
+ _mcintosh_d, _strong
+ )
 
 
 class SmokeTests(TestPluginBase):
@@ -154,7 +160,9 @@ def test_drop_undefined_samples(self):
  [0, 0, 0, 1, 0, 1]]),
  ['A', 'B', 'C'],
  ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
- expected = pd.Series({'S5': 1, 'S6': 1}, name='pielou_evenness')
+ # pandas supports floating point correction for float dtype only,
+ # these 1 ints were changed to 1.0 floats to get that support
+ expected = pd.Series({'S5': 1.0, 'S6': 1.0}, name='pielou_evenness')
  actual = pielou_evenness(table=NaN_table, drop_undefined_samples=True)
  pdt.assert_series_equal(actual, expected, check_dtype=False)
 
@@ -250,3 +258,68 @@ def test_passed_implemented_metric(self):
  for metric in METRICS['NONPHYLO']['IMPL']:
  with self.assertRaisesRegex(TypeError, f"{metric}.*incompatible"):
  self.method(table=self.crawford_tbl, metric=metric)
+
+# tests for passthrough metrics were sourced from skbio
+ def test_berger_parker_d(self):
+ self.assertEqual(_berger_parker(np.array([5, 5])), 0.5)
+ self.assertEqual(_berger_parker(np.array([1, 1, 1, 1, 0])), 0.25)
+
+ def test_brillouin_d(self):
+ self.assertAlmostEqual(_brillouin_d(np.array([1, 2, 0, 0, 3, 1])),
+ 0.86289353018248782)
+
+ def test_esty_ci(self):
+ def _diversity(indices, f):
+ """Calculate diversity index for each window of size 1.
+
+ indices: vector of indices of taxa
+ f: f(counts) -> diversity measure
+
+ """
+ result = []
+ max_size = max(indices) + 1
+ freqs = np.zeros(max_size, dtype=int)
+ for i in range(len(indices)):
+ freqs += np.bincount(indices[i:i + 1], minlength=max_size)
+ try:
+ curr = f(freqs)
+ except (ZeroDivisionError, FloatingPointError):
+ curr = 0
+ result.append(curr)
+ return np.array(result)
+
+ data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4]
+
+ observed_lower, observed_upper = zip(*_diversity(data, _esty_ci))
+
+ expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465,
+ -0.15060902, -0.04386191, -0.33042054,
+ -0.29041008, -0.43554755, -0.33385652])
+ expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465,
+ 0.55060902, 0.71052858, 0.61613483,
+ 0.54041008, 0.43554755, 0.53385652])
+
+ npt.assert_array_almost_equal(observed_lower, expected_lower)
+ npt.assert_array_almost_equal(observed_upper, expected_upper)
+
+ def test_simpson(self):
+ self.assertAlmostEqual(_simpsons_dominance(np.array([1, 0, 2, 5, 2])),
+ 0.66)
+ self.assertAlmostEqual(_simpsons_dominance(np.array([5])), 0)
+
+ def test_goods_coverage(self):
+ counts = [1] * 75 + [2, 2, 2, 2, 2, 2, 3, 4, 4]
+ obs = _goods_coverage(counts)
+ self.assertAlmostEqual(obs, 0.23469387755)
+
+ def test_margalef(self):
+
+ self.assertEqual(_margalef(np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])),
+ 8 / np.log(22))
+
+ def test_mcintosh_d(self):
+ self.assertAlmostEqual(_mcintosh_d(np.array([1, 2, 3])),
+ 0.636061424871458)
+
+ def test_strong(self):
+ self.assertAlmostEqual(_strong(np.array([1, 2, 3, 1])), 0.214285714)