sourmash-bio · luizirber · Jan 3, 2017 · Dec 31, 2016 · Jan 1, 2017 · Jan 1, 2017
diff --git a/sourmash_lib/__init__.py b/sourmash_lib/__init__.py
@@ -4,6 +4,7 @@
 """
 from __future__ import print_function
 import re
+import math
 from ._minhash import MinHash
 
 khmer_available = False
@@ -97,19 +98,96 @@ def jaccard(self, other):
         return self.mh.compare(other.mh)
 
     def similarity(self, other, ignore_abundance=False):
+        """\
+        Calculate similarity of two sketches.
+
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate the angular
+        similarity, 1 - 2*acos(cos)/pi, from their cosine similarity.
+
+        Note: because the abundances (term frequencies) cannot be negative,
+        the angle is always within 0-90 degrees and the result within [0, 1].
+
+        See https://en.wikipedia.org/wiki/Cosine_similarity
+        """
+
         if not self.track_abundance or ignore_abundance:
             return self.jaccard(other)
         else:
             a = self.mh.get_mins(with_abundance=True)
             b = other.mh.get_mins(with_abundance=True)
+            prod = dotproduct(a, b)
+            prod = min(1.0, prod)
 
-            common_abund = 0
-            total_abund = 0
-            for k, abundance in a.items():
-                common_abund += b.get(k, 0)
-                total_abund += abundance
-            return common_abund / float(total_abund)
+            distance = 2*math.acos(prod) / math.pi
+            return 1.0 - distance
 
     def count_common(self, other):
         "Calculate number of common k-mers between two sketches."
         return self.mh.count_common(other.mh)
+
+
+def dotproduct(a, b, normalize=True):
+    """
+    Compute the dot product of two {k: v} abundance dictionaries; with
+    normalize=True, the cosine similarity (0.0 if either norm is zero).
+    """
+
+    if normalize:
+        norm_a = math.sqrt(sum([ x*x for x in a.values() ]))
+        norm_b = math.sqrt(sum([ x*x for x in b.values() ]))
+
+        if norm_a == 0.0 or norm_b == 0.0:
+            return 0.0
+    else:
+        norm_a = 1.0
+        norm_b = 1.0
+
+    prod = 0.
+    for k, abundance in a.items():
+        prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b)
+
+    return prod
+
+
+def test_dotproduct_1():
+    a = {'x': 1}
+    assert dotproduct(a, a, normalize=True) == 1.0
+
+    a = {'x': 1}
+    b = {'x': 1}
+    assert dotproduct(a, b, normalize=True) == 1.0
+
+    c = {'x': 1, 'y': 1}
+    prod = dotproduct(c, c, normalize=True)
+    assert round(prod, 2) == 1.0
+
+    # check a.c => 45 degree angle
+    a = {'x': 1}
+    c = {'x': 1, 'y': 1}
+
+    angle = 45
+    rad = math.radians(angle)
+    cosval = math.cos(rad)
+    prod = dotproduct(a, c, normalize=True)
+    assert round(prod, 2) == 0.71
+    assert round(cosval, 2) == round(prod, 2)
+
+    c = {'x': 1, 'y': 1}
+    d = {'x': 1, 'y': 1}
+    prod = dotproduct(c, d, normalize=True)
+    assert round(prod, 2) == 1.0
+
+    a = {'x': 1}
+    e = {'y': 1}
+    assert dotproduct(a, e, normalize=True) == 0.0
+
+
+def test_dotproduct_zeroes():
+    a = {'x': 1}
+    b = {}
+
+    assert dotproduct(a, b) == 0.0
+    assert dotproduct(b, a) == 0.0
diff --git a/sourmash_lib/test_estimators.py b/sourmash_lib/test_estimators.py
@@ -107,3 +107,29 @@ def test_bad_construct_2(track_abundance):
         assert 0, "require ksize in constructor"
     except ValueError:
         pass
+
+
+def test_abund_similarity():
+    E1 = Estimators(n=5, ksize=20, track_abundance=True)
+    E2 = Estimators(n=5, ksize=20, track_abundance=True)
+
+    for i in [1]:
+        E1.mh.add_hash(i)
+    for i in [1, 2]:
+        E2.mh.add_hash(i)
+
+    assert round(E1.similarity(E1)) == 1.0
+    assert round(E1.similarity(E2), 2) == 0.5
+
+    assert round(E1.similarity(E1, ignore_abundance=True)) == 1.0
+    assert round(E1.similarity(E2, ignore_abundance=True), 2) == 1.0
+
+
+def test_abund_similarity_zero():
+    E1 = Estimators(n=5, ksize=20, track_abundance=True)
+    E2 = Estimators(n=5, ksize=20, track_abundance=True)
+
+    for i in [1]:
+        E1.mh.add_hash(i)
+
+    assert E1.similarity(E2) == 0.0
diff --git a/sourmash_lib/test_sourmash.py b/sourmash_lib/test_sourmash.py
@@ -116,11 +116,13 @@ def test_do_sourmash_compute_multik_with_protein():
         outfile = os.path.join(location, 'short.fa.sig')
         assert os.path.exists(outfile)
 
-        siglist = list(signature.load_signatures(outfile))
-        assert len(siglist) == 4
-        ksizes = set([ x.estimator.ksize for x in siglist ])
-        assert 21 in ksizes
-        assert 30 in ksizes
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 4
+            ksizes = set([ x.estimator.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
 
 
 def test_do_sourmash_compute_multik_with_nothing():
@@ -161,11 +163,13 @@ def test_do_sourmash_compute_multik_only_protein():
         outfile = os.path.join(location, 'short.fa.sig')
         assert os.path.exists(outfile)
 
-        siglist = list(signature.load_signatures(outfile))
-        assert len(siglist) == 2
-        ksizes = set([ x.estimator.ksize for x in siglist ])
-        assert 21 in ksizes
-        assert 30 in ksizes
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.estimator.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
 
 
 def test_do_sourmash_compute_multik_input_is_protein():
@@ -179,16 +183,18 @@ def test_do_sourmash_compute_multik_input_is_protein():
         outfile = os.path.join(location, 'ecoli.faa.sig')
         assert os.path.exists(outfile)
 
-        siglist = list(signature.load_signatures(outfile))
-        assert len(siglist) == 2
-        ksizes = set([ x.estimator.ksize for x in siglist ])
-        assert 21 in ksizes
-        assert 30 in ksizes
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.estimator.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
 
-        moltype = set([ x.estimator.is_molecule_type('protein')
-                        for x in siglist ])
-        assert len(moltype) == 1
-        assert True in moltype
+            moltype = set([ x.estimator.is_molecule_type('protein')
+                            for x in siglist ])
+            assert len(moltype) == 1
+            assert True in moltype
 
 
 def test_do_sourmash_compute_multik_outfile():
@@ -603,11 +609,12 @@ def test_sourmash_compare_with_abundance_2():
     with utils.TempDirectory() as location:
         # create two signatures
         E1 = Estimators(ksize=5, n=5, protein=False,
-                                     track_abundance=True)
+                        track_abundance=True)
         E2 = Estimators(ksize=5, n=5, protein=False,
-                                     track_abundance=True)
+                        track_abundance=True)
 
         E1.mh.add_sequence('ATGGA')
+
         E1.mh.add_sequence('ATGGA')
         E2.mh.add_sequence('ATGGA')
 
@@ -623,107 +630,33 @@ def test_sourmash_compare_with_abundance_2():
                                            ['search', 'e1.sig', 'e2.sig',
                                             '-k' ,'5'],
                                            in_directory=location)
-        assert '0.500' in out
-
-
-def test_do_sourmash_categorize():
-    with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        testdata2 = utils.get_test_data('short2.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', testdata1, testdata2],
-                                           in_directory=location)
+        assert '1.0' in out
 
-        status, out, err = utils.runscript('sourmash',
-                                           ['sbt_index', 'zzz',
-                                            'short.fa.sig',
-                                            'short2.fa.sig'],
-                                           in_directory=location)
 
-        assert os.path.exists(os.path.join(location, 'zzz.sbt.json'))
-
-        status, out, err = utils.runscript('sourmash',
-                                           ['categorize', 'zzz',
-                                            'short.fa.sig',
-                                            '--csv', 'xxx.csv'],
-                                           in_directory=location)
-
-        assert os.path.exists(os.path.join(location, 'xxx.csv'))
-
-        import csv
-        r = csv.reader(open(os.path.join(location, 'xxx.csv')))
-        sig, matchname, match = next(r)
-        assert sig == 'short.fa.sig'
-        assert matchname.endswith('short2.fa')
-        assert round(float(match), 2) == 0.96
-
-
-def test_do_sourmash_categorize_traverse():
+def test_sourmash_compare_with_abundance_3():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        testdata2 = utils.get_test_data('short2.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', testdata1, testdata2],
-                                           in_directory=location)
-
-        status, out, err = utils.runscript('sourmash',
-                                           ['sbt_index', 'zzz',
-                                            'short.fa.sig',
-                                            'short2.fa.sig'],
-                                           in_directory=location)
-
-        assert os.path.exists(os.path.join(location, 'zzz.sbt.json'))
-
-        status, out, err = utils.runscript('sourmash',
-                                           ['categorize', 'zzz',
-                                            '--traverse-directory', '.',
-                                            '--csv', 'xxx.csv'],
-                                           in_directory=location)
-
-        assert os.path.exists(os.path.join(location, 'xxx.csv'))
+        # create two signatures
+        E1 = Estimators(ksize=5, n=5, protein=False,
+                        track_abundance=True)
+        E2 = Estimators(ksize=5, n=5, protein=False,
+                        track_abundance=True)
 
-        import csv
-        r = csv.reader(open(os.path.join(location, 'xxx.csv')))
-        sig, matchname, match = next(r)
-        assert sig == './short.fa.sig'
-        assert matchname.endswith('short2.fa')
-        assert round(float(match), 2) == 0.96
+        E1.mh.add_sequence('ATGGA')
+        E1.mh.add_sequence('GGACA')
 
-        sig, matchname, match = next(r)
-        print((sig, matchname, match,))
-        assert sig == './short2.fa.sig'
-        assert matchname == ''
-        assert round(float(match), 2) == 0.0
+        E1.mh.add_sequence('ATGGA')
+        E2.mh.add_sequence('ATGGA')
 
+        s1 = signature.SourmashSignature('', E1, filename='e1', name='e1')
+        s2 = signature.SourmashSignature('', E2, filename='e2', name='e2')
 
-def test_do_sourmash_watch():
-    with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        testdata2 = utils.get_test_data('short2.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', testdata1, testdata2],
-                                           in_directory=location)
+        signature.save_signatures([s1],
+                                  open(os.path.join(location, 'e1.sig'), 'w'))
+        signature.save_signatures([s2],
+                                  open(os.path.join(location, 'e2.sig'), 'w'))
 
         status, out, err = utils.runscript('sourmash',
-                                           ['sbt_index', 'zzz',
-                                            'short.fa.sig',
-                                            'short2.fa.sig'],
+                                           ['search', 'e1.sig', 'e2.sig',
+                                            '-k' ,'5'],
                                            in_directory=location)
-
-        assert os.path.exists(os.path.join(location, 'zzz.sbt.json'))
-
-        cmd = """
-
-           cat {testdata} |
-           {scripts}/sourmash watch zzz
-
-        """.format(testdata=testdata1,
-                   scripts=utils.scriptpath())
-        (status, out, err) = utils.run_shell_cmd(cmd, in_directory=location)
-        print(out)
-        print(err)
-
-        assert not out
-        assert 'FOUND' in err
-        assert 'short.fa' in err
-        assert 'at 1.000' in err
+        assert '0.705' in out