sourmash-bio · bluegenes · Apr 15, 2022 · Apr 6, 2022 · Apr 6, 2022 · Apr 7, 2022
diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
@@ -0,0 +1,277 @@
+"""
+Utility functions for jaccard/containment --> distance estimates
+From https://github.com/KoslickiLab/mutation-rate-ci-calculator
+https://doi.org/10.1101/2022.01.11.475870
+"""
+from scipy.optimize import brentq
+from scipy.stats import norm as scipy_norm
+from numpy import sqrt
+from math import log, exp
+
+from .logging import notify
+
+
+def r1_to_q(k, r1):
+    r1 = float(r1)
+    q = 1 - (1 - r1) ** k
+    return float(q)
+
+
+def var_n_mutated(L, k, r1, q=None):
+    # there are computational issues in the variance formula that we solve here
+    # by the use of higher-precision arithmetic; the problem occurs when r is
+    # very small; for example, with L=10,k=2,r1=1e-6 standard precision
+    # gives varN<0 which is nonsense; by using the mpf type, we get the correct
+    # answer which is about 0.000038.
+    if r1 == 0:
+        return 0.0
+    r1 = float(r1)
+    if q == None:  # we assume that if q is provided, it is correct for r1
+        q = r1_to_q(k, r1)
+    varN = (
+        L * (1 - q) * (q * (2 * k + (2 / r1) - 1) - 2 * k)
+        + k * (k - 1) * (1 - q) ** 2
+        + (2 * (1 - q) / (r1**2)) * ((1 + (k - 1) * (1 - q)) * r1 - q)
+    )
+    if varN < 0.0:  # this seems to happen only with super tiny test data
+        raise ValueError("Error: varN <0.0!")
+    return float(varN)
+
+
+def exp_n_mutated(L, k, r1):
+    q = r1_to_q(k, r1)
+    return L * q
+
+
+def exp_n_mutated_squared(L, k, p):
+    return var_n_mutated(L, k, p) + exp_n_mutated(L, k, p) ** 2
+
+
+def probit(p):
+    return scipy_norm.ppf(p)
+
+
+def sequence_len_to_n_kmers(sequence_len_bp, ksize):
+    n_kmers = sequence_len_bp - (ksize - 1)
+    return n_kmers
+
+
+def distance_to_identity(dist, d_low=None, d_high=None):
+    """
+    ANI = 1-distance
+    """
+    if not 0 <= dist <= 1:
+        raise ValueError(f"Error: distance value {dist} is not between 0 and 1!")
+    ident = 1 - dist
+    result = ident
+    id_low, id_high = None, None
+    if any([d_low is not None, d_high is not None]):
+        if d_low is not None and d_high is not None:
+            if (0 <= d_low <= 1) and (0 <= d_high <= 1):
+                id_high = 1 - d_low
+                id_low = 1 - d_high
+                result = [ident, id_low, id_high]
+        else:
+            raise ValueError(
+                "Error: `distance_to_identity` expects either one value (distance) or three values (distance, low_ci, high_ci)."
+            )
+
+    return result
+
+
+def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction):
+    """helper function
+    Note that scaled here needs to be between 0 and 1
+    (e.g. scaled 1000 --> scaled_fraction 0.001)
+    """
+    exp_nmut = exp_n_mutated(n_unique_kmers, ksize, mutation_rate)
+    try:
+        return (n_unique_kmers - exp_nmut) * log(1.0 - scaled_fraction)
+    except:
+        return float("-inf")
+
+
+def get_exp_probability_nothing_common(
+    mutation_rate, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None
+):
+    """
+    Given parameters, calculate the expected probability that nothing will be common
+    between a fracminhash sketch of a original sequence and a fracminhash sketch of a mutated
+    sequence. If this is above a threshold, we should suspect that the two sketches may have
+    nothing in common. The threshold needs to be set with proper insights.
+
+    Arguments: n_unique_kmers, ksize, mutation_rate, scaled
+    Returns: float - expected likelihood that nothing is common between sketches
+    """
+    # NTP note: do we have any checks for ksize >=1 in sourmash_args? The rest should be taken care of.
+    # assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1
+    if sequence_len_bp and not n_unique_kmers:
+        n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
+    f_scaled = 1.0 / float(scaled)
+    if mutation_rate == 1.0:
+        return 1.0
+    elif mutation_rate == 0.0:
+        return 0.0
+    return exp(
+        get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, f_scaled)
+    )
+
+
+def containment_to_distance(
+    containment,
+    ksize,
+    scaled,
+    n_unique_kmers=None,
+    sequence_len_bp=None,
+    confidence=0.95,
+    return_identity=False,
+    return_ci=False,
+    prob_threshold=10.0 ** (-3),
+):
+    """
+    Containment --> distance CI (one step)
+    """
+    sol1, sol2, point_estimate = None, None, None
+    if sequence_len_bp and not n_unique_kmers:
+        n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
+    if containment <= 0.0001:
+        sol2 = sol1 = point_estimate = 1.0
+    elif containment >= 0.9999:
+        sol1 = sol2 = point_estimate = 0.0
+    else:
+        point_estimate = 1.0 - containment ** (1.0 / ksize)
+        if return_ci:
+            try:
+                alpha = 1 - confidence
+                z_alpha = probit(1 - alpha / 2)
+                f_scaled = (
+                    1.0 / scaled
+                )  # these use scaled as a fraction between 0 and 1
+
+                bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers
+
+                term_1 = (1.0 - f_scaled) / (
+                    f_scaled * n_unique_kmers**3 * bias_factor**2
+                )
+                term_2 = lambda pest: n_unique_kmers * exp_n_mutated(
+                    n_unique_kmers, ksize, pest
+                ) - exp_n_mutated_squared(n_unique_kmers, ksize, pest)
+                term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (
+                    n_unique_kmers**2
+                )
+
+                var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest)
+
+                f1 = (
+                    lambda pest: (1 - pest) ** ksize
+                    + z_alpha * sqrt(var_direct(pest))
+                    - containment
+                )
+                f2 = (
+                    lambda pest: (1 - pest) ** ksize
+                    - z_alpha * sqrt(var_direct(pest))
+                    - containment
+                )
+
+                sol1 = brentq(f1, 0.0000001, 0.9999999)
+                sol2 = brentq(f2, 0.0000001, 0.9999999)
+
+            except ValueError as exc:
+                # afaict, this only happens with extremely small test data
+                notify(
+                    "WARNING: Cannot estimate ANI from containment. Do your sketches contain enough hashes?"
+                )
+                notify(str(exc))
+                return None, None, None
+
+    # Do this here, so that we don't need to reconvert distance <--> identity later.
+    prob_nothing_in_common = get_exp_probability_nothing_common(
+        point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers
+    )
+    if prob_nothing_in_common >= prob_threshold:
+        # TO DO: keep count; suggest user decrease scaled value. If that is unsuccessful, maybe decrease ksize
+        notify(
+            "WARNING: These sketches may have no hashes in common based on chance alone."
+        )
+
+    if return_identity:
+        if any([sol1 is None, sol2 is None]):
+            point_estimate = distance_to_identity(point_estimate)
+        else:
+            point_estimate, sol2, sol1 = distance_to_identity(
+                point_estimate, sol2, sol1
+            )
+
+    if return_ci:
+        return point_estimate, sol2, sol1, prob_nothing_in_common
+
+    return point_estimate, prob_nothing_in_common
+
+
+def jaccard_to_distance(
+    jaccard,
+    ksize,
+    scaled,
+    n_unique_kmers=None,
+    sequence_len_bp=None,
+    return_identity=False,
+    prob_threshold=10.0 ** (-3),
+    err_threshold=10.0 ** (-4.0),
+):
+    """
+    Given parameters, calculate point estimate for mutation rate from jaccard index.
+    First checks if parameters are valid (checks are not exhaustive). Then uses formulas
+    derived mathematically to compute the point estimate. The formula uses approximations,
+    therefore a tiny error is associated with it. A lower bound of that error is also returned.
+    A high error indicates that the point estimate cannot be trusted. Threshold of the error
+    is open to interpretation, but suggested that > 10^-4 should be handled with caution.
+
+    Note that the error is NOT a mutation rate, and therefore cannot be considered in
+    something like mut.rate +/- error.
+
+    Arguments: jaccard, ksize, scaled, n_unique_kmers
+    # Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error)
+
+    # Returns: point_estimate_of_mutation_rate
+
+    Note: point estimate does not consider impact of scaled, but p_nothing_in_common can be
+    useful for determining whether scaled is sufficient for these comparisons.
+    """
+    error_lower_bound = None
+    if sequence_len_bp and not n_unique_kmers:
+        n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
+    if jaccard <= 0.0001:
+        point_estimate = 1.0
+        error_lower_bound = 0.0
+    elif jaccard >= 0.9999:
+        point_estimate = 0.0
+        error_lower_bound = 0.0
+    else:
+        point_estimate = 1.0 - (2.0 * jaccard / float(1 + jaccard)) ** (
+            1.0 / float(ksize)
+        )
+
+        exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate)
+        var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate)
+        error_lower_bound = (
+            1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut) ** 3
+        )
+
+    if error_lower_bound is not None and error_lower_bound > err_threshold:
+        notify(
+            f"WARNING: Error on Jaccard distance point estimate is too high ({error_lower_bound})."
+        )
+
+    prob_nothing_in_common = get_exp_probability_nothing_common(
+        point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers
+    )
+    if prob_nothing_in_common >= prob_threshold:
+        # to do: keep count and recommend user lower scaled val
+        notify(
+            "WARNING: These sketches may have no hashes in common based on chance alone."
+        )
+
+    if return_identity:
+        point_estimate = distance_to_identity(point_estimate)
+
+    return point_estimate, prob_nothing_in_common, error_lower_bound
diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
@@ -6,6 +6,7 @@ class MinHash - core MinHash class.
 class FrozenMinHash - read-only MinHash class.
 """
 from __future__ import unicode_literals, division
+from .distance_utils import jaccard_to_distance, containment_to_distance
 
 __all__ = ['get_minhash_default_seed',
            'get_minhash_max_hash',
@@ -646,6 +647,26 @@ def jaccard(self, other, downsample=False):
             raise TypeError(err)
         return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample)
 
+    def jaccard_ani(self, other, downsample=False, jaccard=None, prob_threshold=10.0 ** (-3),  err_threshold=10.0 ** (-4.0)):
+        "Calculate Jaccard --> ANI of two MinHash objects."
+        self_mh = self
+        other_mh = other
+        scaled = self.scaled
+        if downsample:
+            scaled = max(self_mh.scaled, other_mh.scaled)
+            self_mh = self.downsample(scaled=scaled)
+            other_mh = other.downsample(scaled=scaled)
+        if jaccard is None:
+            jaccard = self_mh.similarity(other_mh, ignore_abundance=True)
+        avg_scaled_kmers = round((len(self_mh) + len(other_mh))/2)
+        avg_n_kmers = avg_scaled_kmers * scaled  # would be better if hll estimate - see #1798
+        j_ani,p_nothing_in_common,error_lower_bound = jaccard_to_distance(jaccard, self_mh.ksize, scaled,
+                                                    n_unique_kmers=avg_n_kmers,
+                                                    return_identity=True,
+                                                    prob_threshold = prob_threshold,
+                                                    err_threshold = err_threshold)
+        return j_ani, p_nothing_in_common, error_lower_bound
+
     def similarity(self, other, ignore_abundance=False, downsample=False):
         """Calculate similarity of two sketches.
 
@@ -683,6 +704,26 @@ def contained_by(self, other, downsample=False):
 
         return self.count_common(other, downsample) / len(self)
 
+    def containment_ani(self, other, downsample=False, containment=None, confidence=0.95, return_ci = False):
+        "Estimate ANI from containment with the other MinHash."
+        self_mh = self
+        other_mh = other
+        scaled = self.scaled
+        if downsample:
+            scaled = max(self_mh.scaled, other_mh.scaled)
+            self_mh = self.downsample(scaled=scaled)
+            other_mh = other.downsample(scaled=scaled)
+        if containment is None:
+            containment = self_mh.contained_by(other_mh)
+        n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798
+
+        ## ani_info will be two values if return_ci is False; four values if return_ci is True
+        ## not ideal, but I'm not sure how to fix.
+        ani_info = containment_to_distance(containment, self_mh.ksize, self_mh.scaled,
+                                                        n_unique_kmers=n_kmers, confidence=confidence,
+                                                        return_identity=True, return_ci = return_ci)
+        return ani_info
+
     def max_containment(self, other, downsample=False):
         """
         Calculate maximum containment.
@@ -695,6 +736,27 @@ def max_containment(self, other, downsample=False):
 
         return self.count_common(other, downsample) / min_denom
 
+    def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95, return_ci=False):
+        "Estimate ANI from containment with the other MinHash."
+        self_mh = self
+        other_mh = other
+        scaled = self.scaled
+        if downsample:
+            scaled = max(self_mh.scaled, other_mh.scaled)
+            self_mh = self.downsample(scaled=scaled)
+            other_mh = other.downsample(scaled=scaled)
+        if max_containment is None:
+            max_containment = self_mh.max_containment(other_mh)
+        min_n_kmers = min(len(self_mh), len(other_mh))
+        n_kmers = min_n_kmers * scaled  # would be better if hll estimate - see #1798
+
+        ## ani_info will be two values if return_ci is False; four values if return_ci is True
+        ## not ideal, but I'm not sure how to fix.
+        ani_info = containment_to_distance(max_containment, self_mh.ksize, scaled,
+                                           n_unique_kmers=n_kmers,confidence=confidence,
+                                           return_identity=True, return_ci = return_ci)
+        return ani_info
+
     def __add__(self, other):
         if not isinstance(other, MinHash):
             raise TypeError("can only add MinHash objects to MinHash objects!")