Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add lca DBs as inputs to 'sourmash search' and 'gather' #533

Merged
merged 21 commits into from
Dec 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions doc/databases.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ to build the databases.

## Genbank LCA Database

These databases are formatted for use with `sourmash lca`.
These databases are formatted for use with `sourmash lca`; they are
v2 LCA databases and will work with sourmash v2.0a11 and later.

Approximately 87,000 microbial genomes (including viral and fungal)
from NCBI Genbank.

* [Genbank k=21, 2017.11.07](https://osf.io/s3jx8/download), 105 MB
* [Genbank k=31, 2017.11.07](https://osf.io/zskb9/download), 118 MB
* [Genbank k=51, 2017.11.07](https://osf.io/md2nt/download), 123 MB
* [Genbank k=21, 2017.11.07](https://osf.io/d7rv8/download), 109 MB
* [Genbank k=31, 2017.11.07](https://osf.io/4f8n3/download), 120 MB
* [Genbank k=51, 2017.11.07](https://osf.io/nemkw/download), 125 MB

### Details

Expand Down
5 changes: 3 additions & 2 deletions sourmash/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
plot, watch, info, storage, migrate)
plot, watch, info, storage, migrate, multigather)
from .lca import main as lca_main

usage='''
Expand Down Expand Up @@ -58,7 +58,8 @@ def main():
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main,
'migrate': migrate}
'migrate': migrate,
'multigather': multigather}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
Expand Down
2 changes: 1 addition & 1 deletion sourmash/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ cdef class MinHash(object):

old_scaled = get_scaled_for_max_hash(self.max_hash)
if old_scaled > new_num:
raise ValueError('new scaled is lower than current sample scaled')
raise ValueError('new scaled {} is lower than current sample scaled {}'.format(new_num, old_scaled))

new_max_hash = get_max_hash_for_scaled(new_num)

Expand Down
167 changes: 163 additions & 4 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ def search(args):
query.minhash = query.minhash.downsample_scaled(args.scaled)

# set up the search databases
databases = sourmash_args.load_sbts_and_sigs(args.databases, query,
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not args.containment,
args.traverse_directory)

Expand Down Expand Up @@ -1018,12 +1018,12 @@ def gather(args):
query.minhash = query.minhash.downsample_scaled(args.scaled)

# empty?
if not query.minhash.get_mins():
if not len(query.minhash):
error('no query hashes!? exiting.')
sys.exit(-1)

# set up the search databases
databases = sourmash_args.load_sbts_and_sigs(args.databases, query, False,
databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False,
args.traverse_directory)

if not len(databases):
Expand Down Expand Up @@ -1097,7 +1097,7 @@ def gather(args):
if args.output_unassigned:
if not found:
notify('nothing found - entire query signature unassigned.')
elif not query.minhash.get_mins():
elif not len(query.minhash):
notify('no unassigned hashes! not saving.')
else:
outname = args.output_unassigned.name
Expand All @@ -1109,6 +1109,165 @@ def gather(args):
args.output_unassigned)


def multigather(args):
    """Run 'gather' for each of several queries against several databases.

    For every query signature given via --query, run the min-set-cov
    ('gather') algorithm against all databases given via --db, printing
    interim results, and write three files per query named after the query
    file's basename:

    * '<base>.csv' - gather results, one CSV row per match;
    * '<base>.matches.sig' - all matching signatures;
    * '<base>.unassigned.sig' - query hashes assigned to no match.

    Queries must have been computed with --scaled; queries without a
    max_hash are skipped with an error message.
    """
    from .search import gather_databases, format_bp

    parser = argparse.ArgumentParser()
    parser.add_argument('--db', nargs='+', action='append')
    parser.add_argument('--query', nargs='+', action='append')
    parser.add_argument('--traverse-directory', action='store_true',
                        help='search all signatures underneath directories.')
    parser.add_argument('--threshold-bp', type=float, default=5e4,
                        help='threshold (in bp) for reporting results')
    parser.add_argument('--scaled', type=float, default=0,
                        help='downsample query to this scaled factor')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('--ignore-abundance', action='store_true',
                        help='do NOT use k-mer abundances if present')
    parser.add_argument('-d', '--debug', action='store_true')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.db:
        error('Error! must specify at least one database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    # flatten --db and --query: each flag may be given several times, and
    # each occurrence may carry several values (nargs='+' + action='append').
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # use the first query to select compatible databases; the remaining
    # queries are assumed to share its ksize/moltype.
    query = sourmash_args.load_query_signature(args.query[0],
                                               ksize=args.ksize,
                                               select_moltype=moltype)

    # set up the search databases
    databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
                                                args.traverse_directory)

    if not len(databases):
        error('Nothing found to search!')
        sys.exit(-1)

    # run gather on all the queries.
    for queryfile in args.query:
        # load the query signature & figure out all the things
        query = sourmash_args.load_query_signature(queryfile,
                                                   ksize=args.ksize,
                                                   select_moltype=moltype)
        notify('loaded query: {}... (k={}, {})', query.name()[:30],
               query.minhash.ksize,
               sourmash_args.get_moltype(query))

        # verify signature was computed right: gather needs --scaled sigs.
        if query.minhash.max_hash == 0:
            error('query signature needs to be created with --scaled; skipping')
            continue

        # downsample if requested
        if args.scaled:
            notify('downsampling query from scaled={} to {}',
                   query.minhash.scaled, int(args.scaled))
            query.minhash = query.minhash.downsample_scaled(args.scaled)

        # empty?
        if not len(query.minhash):
            error('no query hashes!? skipping to next..')
            continue

        found = []
        weighted_missed = 1
        for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
            if not len(found):                # first result? print header.
                if query.minhash.track_abundance and not args.ignore_abundance:
                    print_results("")
                    print_results("overlap     p_query p_match avg_abund")
                    print_results("---------   ------- ------- ---------")
                else:
                    print_results("")
                    print_results("overlap     p_query p_match")
                    print_results("---------   ------- -------")

            # print interim result & save in a list for later use
            pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
            pct_genome = '{:.1f}%'.format(result.f_match*100)
            average_abund ='{:.1f}'.format(result.average_abund)
            name = result.leaf._display_name(40)

            if query.minhash.track_abundance and not args.ignore_abundance:
                print_results('{:9}   {:>7} {:>7} {:>9}    {}',
                              format_bp(result.intersect_bp), pct_query,
                              pct_genome, average_abund, name)
            else:
                print_results('{:9}   {:>7} {:>7}    {}',
                              format_bp(result.intersect_bp), pct_query,
                              pct_genome, name)
            found.append(result)

        # basic reporting
        print_results('\nfound {} matches total;', len(found))
        print_results('the recovered matches hit {:.1f}% of the query',
                      (1 - weighted_missed) * 100)
        print_results('')

        if not found:
            notify('nothing found... skipping.')
            continue

        # outputs are named after the query file's basename, in the cwd.
        output_base = os.path.basename(queryfile)
        output_csv = output_base + '.csv'

        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
                      'f_unique_to_query', 'f_unique_weighted',
                      'average_abund', 'median_abund', 'std_abund', 'name',
                      'filename', 'md5']
        with open(output_csv, 'wt') as fp:
            w = csv.DictWriter(fp, fieldnames=fieldnames)
            w.writeheader()
            for result in found:
                d = dict(result._asdict())
                del d['leaf']                 # actual signature not in CSV.
                w.writerow(d)

        output_matches = output_base + '.matches.sig'
        with open(output_matches, 'wt') as fp:
            notify('saving all matches to "{}"', output_matches)
            sig.save_signatures([ r.leaf for r in found ], fp)

        # save whatever of the query remains unassigned after all matches;
        # 'found' is guaranteed non-empty here (see 'continue' above).
        output_unassigned = output_base + '.unassigned.sig'
        with open(output_unassigned, 'wt') as fp:
            if not len(query.minhash):
                notify('no unassigned hashes! not saving.')
            else:
                notify('saving unassigned hashes to "{}"', output_unassigned)

                e = MinHash(ksize=query.minhash.ksize, n=0,
                            max_hash=new_max_hash)
                e.add_many(next_query.minhash.get_mins())
                sig.save_signatures([ sig.SourmashSignature(e) ], fp)

        # fini, next query!

def watch(args):
"Build a signature from raw FASTA/FASTQ coming in on stdin, search."

Expand Down
70 changes: 34 additions & 36 deletions sourmash/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,71 +81,59 @@ def gather_signature(query_sig, dblist, ignore_abundance):
md5_to_lineage = {}
md5_to_name = {}

x = set()
for hashval in query_mins:
for lca_db in dblist:
lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
for lid in lineage_ids:
md5 = lca_db.lineage_id_to_signature[lid]
x.add((lca_db, lid, md5))

for lca_db, lid, md5 in x:
md5_to_lineage[md5] = lca_db.lineage_dict[lid]
if lca_db.signature_to_name:
md5_to_name[md5] = lca_db.signature_to_name[md5]
else:
md5_to_name[md5] = ''

# now! do the gather:
while 1:
# find all of the assignments for the current set of hashes
assignments = defaultdict(set)
for hashval in query_mins:
for lca_db in dblist:
lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
for lid in lineage_ids:
md5 = lca_db.lineage_id_to_signature[lid]
signature_size = lca_db.lineage_id_counts[lid]
assignments[hashval].add((md5, signature_size))
idx_list = lca_db.hashval_to_idx.get(hashval, [])

for idx in idx_list:
assignments[hashval].add((lca_db, idx))

# none? quit.
if not assignments:
break

# count the distinct signatures.
counts = Counter()
for hashval, md5_set in assignments.items():
for (md5, sigsize) in md5_set:
counts[(md5, sigsize)] += 1
for hashval, match_set in assignments.items():
for (lca_db, idx) in match_set:
counts[(lca_db, idx)] += 1

# collect the most abundant assignments
common_iter = iter(counts.most_common())
best_list = []
(md5, sigsize), top_count = next(common_iter)
(best_lca_db, best_idx), top_count = next(common_iter)

best_list.append((md5_to_name[md5], md5, sigsize))
for (md5, sigsize), count in common_iter:
best_list.append((best_lca_db, best_idx))
for (lca_db, idx), count in common_iter:
if count != top_count:
break
best_list.append((md5_to_name[md5], md5, sigsize))
best_list.append((lca_db, idx))

# sort on name and pick the top (for consistency).
best_list.sort()
_, top_md5, top_sigsize = best_list[0]
# sort on idx and pick the lowest (for consistency).
best_list.sort(key=lambda x: x[1])
best_lca_db, best_idx = best_list[0]

equiv_counts = len(best_list) - 1

# now, remove from query mins.
# now, remove hashes from query mins.
intersect_mins = set()
for hashval, md5_set in assignments.items():
if (top_md5, top_sigsize) in md5_set:
for hashval, match_set in assignments.items():
if (best_lca_db, best_idx) in match_set:
query_mins.remove(hashval)
intersect_mins.add(hashval)

# should match!
assert top_count == len(intersect_mins)

# calculate size of match (# of hashvals belonging to that sig)
match_size = top_sigsize
match_size = 0
for hashval, idx_list in best_lca_db.hashval_to_idx.items():
ctb marked this conversation as resolved.
Show resolved Hide resolved
if best_idx in idx_list:
match_size += 1

# construct 'result' object
intersect_bp = top_count * query_sig.minhash.scaled
Expand All @@ -155,13 +143,23 @@ def gather_signature(query_sig, dblist, ignore_abundance):
/ len(intersect_mins)
f_match = len(intersect_mins) / match_size

# XXX name and lineage
for ident, idx in best_lca_db.ident_to_idx.items():
if idx == best_idx:
name = best_lca_db.ident_to_name[ident]

lid = best_lca_db.idx_to_lid.get(best_idx)
lineage = ()
if lid is not None:
lineage = best_lca_db.lid_to_lineage[lid]

result = LCAGatherResult(intersect_bp = intersect_bp,
f_unique_to_query= top_count / n_mins,
f_unique_weighted=f_unique_weighted,
average_abund=average_abund,
f_match=f_match,
lineage=md5_to_lineage[top_md5],
name=md5_to_name[top_md5],
lineage=lineage,
name=name,
n_equal_matches=equiv_counts)

f_unassigned = len(query_mins) / n_mins
Expand Down
Loading