Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] distance utility functions to support ANI estimation #1934

Merged
merged 26 commits into from
Apr 15, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2e9910f
split distance utils and start updating tests for function return cha…
bluegenes Apr 6, 2022
f1cd7db
upd containment tests
bluegenes Apr 6, 2022
7ec4e49
upd jaccard tests
bluegenes Apr 7, 2022
28eb968
add p_nothing_in_common test; cleanup unused jaccard formulas
bluegenes Apr 8, 2022
3ebae36
pyflakes, formatting
bluegenes Apr 8, 2022
09dedd2
raise err with bad input to distance_to_identity
bluegenes Apr 8, 2022
e9c055f
add minhash fns and tests
bluegenes Apr 8, 2022
e0719da
upd
bluegenes Apr 11, 2022
8d7b0d3
Apply suggestions from code review
bluegenes Apr 11, 2022
3fbe684
Merge branch 'add-distance-utils-only' of github.com:sourmash-bio/sou…
bluegenes Apr 11, 2022
e92589a
try a dataclass
bluegenes Apr 12, 2022
853ea85
better dataclasses; init revamp tests for aniresult classes
bluegenes Apr 13, 2022
454abf1
mod all tests to for aniresult dataclass
bluegenes Apr 13, 2022
9a49224
init minhash changes for aniresult
bluegenes Apr 14, 2022
f1b8d72
Merge branch 'latest' into add-distance-utils-only
ctb Apr 14, 2022
a6c660a
upd tests
bluegenes Apr 14, 2022
90671fa
tiny testdata tests for var0.0
bluegenes Apr 14, 2022
9344158
Merge branch 'add-distance-utils-only' of github.com:sourmash-bio/sou…
bluegenes Apr 14, 2022
01d4ae7
test var_n_mutated directly
bluegenes Apr 14, 2022
017bd32
test test_handle_seqlen_nkmers directly
bluegenes Apr 14, 2022
83a050d
clarify comment
bluegenes Apr 14, 2022
2f46649
dont populate fake CI for dist=0,dist=1 cases
bluegenes Apr 15, 2022
e88e454
fix newline
bluegenes Apr 15, 2022
731d286
Update src/sourmash/distance_utils.py
bluegenes Apr 15, 2022
9f2349c
Merge branch 'latest' into add-distance-utils-only
bluegenes Apr 15, 2022
fb54506
fix corresponding test
bluegenes Apr 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 277 additions & 0 deletions src/sourmash/distance_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
"""
Utility functions for jaccard/containment --> distance estimates
From https://github.com/KoslickiLab/mutation-rate-ci-calculator
https://doi.org/10.1101/2022.01.11.475870
"""
from scipy.optimize import brentq
from scipy.stats import norm as scipy_norm
from numpy import sqrt
from math import log, exp

from .logging import notify


def r1_to_q(k, r1):
r1 = float(r1)
q = 1 - (1 - r1) ** k
return float(q)


def var_n_mutated(L, k, r1, q=None):
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
# there are computational issues in the variance formula that we solve here
# by the use of higher-precision arithmetic; the problem occurs when r is
# very small; for example, with L=10,k=2,r1=1e-6 standard precision
# gives varN<0 which is nonsense; by using the mpf type, we get the correct
# answer which is about 0.000038.
if r1 == 0:
return 0.0
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
r1 = float(r1)
if q == None: # we assume that if q is provided, it is correct for r1
q = r1_to_q(k, r1)
varN = (
L * (1 - q) * (q * (2 * k + (2 / r1) - 1) - 2 * k)
+ k * (k - 1) * (1 - q) ** 2
+ (2 * (1 - q) / (r1**2)) * ((1 + (k - 1) * (1 - q)) * r1 - q)
)
if varN < 0.0: # this seems to happen only with super tiny test data
raise ValueError("Error: varN <0.0!")
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
return float(varN)


def exp_n_mutated(L, k, r1):
q = r1_to_q(k, r1)
return L * q


def exp_n_mutated_squared(L, k, p):
return var_n_mutated(L, k, p) + exp_n_mutated(L, k, p) ** 2


def probit(p):
return scipy_norm.ppf(p)


def sequence_len_to_n_kmers(sequence_len_bp, ksize):
n_kmers = sequence_len_bp - (ksize - 1)
return n_kmers


def distance_to_identity(dist, d_low=None, d_high=None):
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
"""
ANI = 1-distance
"""
if not 0 <= dist <= 1:
raise ValueError(f"Error: distance value {dist} is not between 0 and 1!")
ident = 1 - dist
result = ident
id_low, id_high = None, None
if any([d_low is not None, d_high is not None]):
if d_low is not None and d_high is not None:
if (0 <= d_low <= 1) and (0 <= d_high <= 1):
id_high = 1 - d_low
id_low = 1 - d_high
result = [ident, id_low, id_high]
else:
raise ValueError(
"Error: `distance_to_identity` expects either one value (distance) or three values (distance, low_ci, high_ci)."
)

return result


def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction):
"""helper function
Note that scaled here needs to be between 0 and 1
(e.g. scaled 1000 --> scaled_fraction 0.001)
"""
exp_nmut = exp_n_mutated(n_unique_kmers, ksize, mutation_rate)
try:
return (n_unique_kmers - exp_nmut) * log(1.0 - scaled_fraction)
except:
return float("-inf")


def get_exp_probability_nothing_common(
mutation_rate, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
):
"""
Given parameters, calculate the expected probability that nothing will be common
between a fracminhash sketch of a original sequence and a fracminhash sketch of a mutated
sequence. If this is above a threshold, we should suspect that the two sketches may have
nothing in common. The threshold needs to be set with proper insights.

Arguments: n_unique_kmers, ksize, mutation_rate, scaled
Returns: float - expected likelihood that nothing is common between sketches
"""
# NTP note: do we have any checks for ksize >=1 in sourmash_args? The rest should be taken care of.
# assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1
if sequence_len_bp and not n_unique_kmers:
n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
f_scaled = 1.0 / float(scaled)
if mutation_rate == 1.0:
return 1.0
elif mutation_rate == 0.0:
return 0.0
return exp(
get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, f_scaled)
)


def containment_to_distance(
containment,
ksize,
scaled,
n_unique_kmers=None,
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
sequence_len_bp=None,
confidence=0.95,
return_identity=False,
return_ci=False,
prob_threshold=10.0 ** (-3),
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
):
"""
Containment --> distance CI (one step)
"""
sol1, sol2, point_estimate = None, None, None
if sequence_len_bp and not n_unique_kmers:
n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
if containment <= 0.0001:
sol2 = sol1 = point_estimate = 1.0
elif containment >= 0.9999:
sol1 = sol2 = point_estimate = 0.0
else:
point_estimate = 1.0 - containment ** (1.0 / ksize)
if return_ci:
try:
alpha = 1 - confidence
z_alpha = probit(1 - alpha / 2)
f_scaled = (
1.0 / scaled
) # these use scaled as a fraction between 0 and 1

bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers

term_1 = (1.0 - f_scaled) / (
f_scaled * n_unique_kmers**3 * bias_factor**2
)
term_2 = lambda pest: n_unique_kmers * exp_n_mutated(
n_unique_kmers, ksize, pest
) - exp_n_mutated_squared(n_unique_kmers, ksize, pest)
term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (
n_unique_kmers**2
)

var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest)

f1 = (
lambda pest: (1 - pest) ** ksize
+ z_alpha * sqrt(var_direct(pest))
- containment
)
f2 = (
lambda pest: (1 - pest) ** ksize
- z_alpha * sqrt(var_direct(pest))
- containment
)

sol1 = brentq(f1, 0.0000001, 0.9999999)
sol2 = brentq(f2, 0.0000001, 0.9999999)

except ValueError as exc:
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
# afaict, this only happens with extremely small test data
notify(
"WARNING: Cannot estimate ANI from containment. Do your sketches contain enough hashes?"
)
notify(str(exc))
return None, None, None

# Do this here, so that we don't need to reconvert distance <--> identity later.
prob_nothing_in_common = get_exp_probability_nothing_common(
point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers
)
if prob_nothing_in_common >= prob_threshold:
# TO DO: keep count; suggest user decrease scaled value. If that is unsuccessful, maybe decrease ksize
notify(
"WARNING: These sketches may have no hashes in common based on chance alone."
)

if return_identity:
if any([sol1 is None, sol2 is None]):
point_estimate = distance_to_identity(point_estimate)
else:
point_estimate, sol2, sol1 = distance_to_identity(
point_estimate, sol2, sol1
)

if return_ci:
return point_estimate, sol2, sol1, prob_nothing_in_common

return point_estimate, prob_nothing_in_common


def jaccard_to_distance(
jaccard,
ksize,
scaled,
n_unique_kmers=None,
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
sequence_len_bp=None,
return_identity=False,
prob_threshold=10.0 ** (-3),
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
err_threshold=10.0 ** (-4.0),
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
):
"""
Given parameters, calculate point estimate for mutation rate from jaccard index.
First checks if parameters are valid (checks are not exhaustive). Then uses formulas
derived mathematically to compute the point estimate. The formula uses approximations,
therefore a tiny error is associated with it. A lower bound of that error is also returned.
A high error indicates that the point estimate cannot be trusted. Threshold of the error
is open to interpretation, but suggested that > 10^-4 should be handled with caution.

Note that the error is NOT a mutation rate, and therefore cannot be considered in
something like mut.rate +/- error.

Arguments: jaccard, ksize, scaled, n_unique_kmers
# Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error)

# Returns: point_estimate_of_mutation_rate

Note: point estimate does not consider impact of scaled, but p_nothing_in_common can be
useful for determining whether scaled is sufficient for these comparisons.
"""
error_lower_bound = None
if sequence_len_bp and not n_unique_kmers:
n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
if jaccard <= 0.0001:
point_estimate = 1.0
error_lower_bound = 0.0
elif jaccard >= 0.9999:
point_estimate = 0.0
error_lower_bound = 0.0
else:
point_estimate = 1.0 - (2.0 * jaccard / float(1 + jaccard)) ** (
1.0 / float(ksize)
)

exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate)
var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate)
error_lower_bound = (
1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut) ** 3
)

if error_lower_bound is not None and error_lower_bound > err_threshold:
notify(
f"WARNING: Error on Jaccard distance point estimate is too high ({error_lower_bound})."
)

prob_nothing_in_common = get_exp_probability_nothing_common(
point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers
)
if prob_nothing_in_common >= prob_threshold:
# to do: keep count and recommend user lower scaled val
notify(
"WARNING: These sketches may have no hashes in common based on chance alone."
)

if return_identity:
point_estimate = distance_to_identity(point_estimate)

return point_estimate, prob_nothing_in_common, error_lower_bound
62 changes: 62 additions & 0 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class MinHash - core MinHash class.
class FrozenMinHash - read-only MinHash class.
"""
from __future__ import unicode_literals, division
from .distance_utils import jaccard_to_distance, containment_to_distance

__all__ = ['get_minhash_default_seed',
'get_minhash_max_hash',
Expand Down Expand Up @@ -646,6 +647,26 @@ def jaccard(self, other, downsample=False):
raise TypeError(err)
return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample)

def jaccard_ani(self, other, downsample=False, jaccard=None, prob_threshold=10.0 ** (-3), err_threshold=10.0 ** (-4.0)):
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
"Calculate Jaccard --> ANI of two MinHash objects."
self_mh = self
other_mh = other
scaled = self.scaled
if downsample:
scaled = max(self_mh.scaled, other_mh.scaled)
self_mh = self.downsample(scaled=scaled)
other_mh = other.downsample(scaled=scaled)
if jaccard is None:
jaccard = self_mh.similarity(other_mh, ignore_abundance=True)
avg_scaled_kmers = round((len(self_mh) + len(other_mh))/2)
avg_n_kmers = avg_scaled_kmers * scaled # would be better if hll estimate - see #1798
j_ani,p_nothing_in_common,error_lower_bound = jaccard_to_distance(jaccard, self_mh.ksize, scaled,
n_unique_kmers=avg_n_kmers,
return_identity=True,
prob_threshold = prob_threshold,
err_threshold = err_threshold)
return j_ani, p_nothing_in_common, error_lower_bound

def similarity(self, other, ignore_abundance=False, downsample=False):
"""Calculate similarity of two sketches.

Expand Down Expand Up @@ -683,6 +704,26 @@ def contained_by(self, other, downsample=False):

return self.count_common(other, downsample) / len(self)

def containment_ani(self, other, downsample=False, containment=None, confidence=0.95, return_ci = False):
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
"Estimate ANI from containment with the other MinHash."
self_mh = self
other_mh = other
scaled = self.scaled
if downsample:
scaled = max(self_mh.scaled, other_mh.scaled)
self_mh = self.downsample(scaled=scaled)
other_mh = other.downsample(scaled=scaled)
if containment is None:
containment = self_mh.contained_by(other_mh)
n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798

## ani_info will be two values if return_ci is False; four values if return_ci is True
## not ideal, but I'm not sure how to fix.
ani_info = containment_to_distance(containment, self_mh.ksize, self_mh.scaled,
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
n_unique_kmers=n_kmers, confidence=confidence,
return_identity=True, return_ci = return_ci)
return ani_info

def max_containment(self, other, downsample=False):
"""
Calculate maximum containment.
Expand All @@ -695,6 +736,27 @@ def max_containment(self, other, downsample=False):

return self.count_common(other, downsample) / min_denom

def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95, return_ci=False):
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
"Estimate ANI from containment with the other MinHash."
self_mh = self
other_mh = other
scaled = self.scaled
if downsample:
scaled = max(self_mh.scaled, other_mh.scaled)
self_mh = self.downsample(scaled=scaled)
other_mh = other.downsample(scaled=scaled)
if max_containment is None:
max_containment = self_mh.max_containment(other_mh)
min_n_kmers = min(len(self_mh), len(other_mh))
n_kmers = min_n_kmers * scaled # would be better if hll estimate - see #1798

## ani_info will be two values if return_ci is False; four values if return_ci is True
## not ideal, but I'm not sure how to fix.
ani_info = containment_to_distance(max_containment, self_mh.ksize, scaled,
n_unique_kmers=n_kmers,confidence=confidence,
return_identity=True, return_ci = return_ci)
return ani_info

def __add__(self, other):
if not isinstance(other, MinHash):
raise TypeError("can only add MinHash objects to MinHash objects!")
Expand Down
Loading