add lca DBs as inputs to 'sourmash search' and 'gather' (#533)

ctb authored Dec 19, 2018
1 parent 21b400d commit 954d729

Showing 22 changed files with 774 additions and 227 deletions.
9 changes: 5 additions & 4 deletions doc/databases.md
@@ -56,14 +56,15 @@ to build the databases.

## Genbank LCA Database

-These databases are formatted for use with `sourmash lca`.
+These databases are formatted for use with `sourmash lca`; they are
+v2 LCA databases and will work with sourmash v2.0a11 and later.

Approximately 87,000 microbial genomes (including viral and fungal)
from NCBI Genbank.

-* [Genbank k=21, 2017.11.07](https://osf.io/s3jx8/download), 105 MB
-* [Genbank k=31, 2017.11.07](https://osf.io/zskb9/download), 118 MB
-* [Genbank k=51, 2017.11.07](https://osf.io/md2nt/download), 123 MB
+* [Genbank k=21, 2017.11.07](https://osf.io/d7rv8/download), 109 MB
+* [Genbank k=31, 2017.11.07](https://osf.io/4f8n3/download), 120 MB
+* [Genbank k=51, 2017.11.07](https://osf.io/nemkw/download), 125 MB

### Details

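These LCA databases can now be given directly to `sourmash search` and
`sourmash gather`, alongside SBT indexes and plain signature files. A
minimal sketch of the loading path this commit adds, assuming sourmash
v2.0a11+ and two hypothetical local files, `query.sig` and
`genbank-k31.lca.json.gz`:

```python
# Sketch: load a query signature, then let sourmash pick the right
# loader (signature file, SBT index, or LCA JSON database) for each
# database argument -- this mirrors what search()/gather() do below.
from sourmash import sourmash_args

query = sourmash_args.load_query_signature('query.sig', ksize=31,
                                           select_moltype='DNA')

# third argument mirrors `not args.containment` in search();
# gather() passes False; the last argument is traverse_directory
databases = sourmash_args.load_dbs_and_sigs(['genbank-k31.lca.json.gz'],
                                            query, False, False)
print('loaded {} database(s)'.format(len(databases)))
```
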
5 changes: 3 additions & 2 deletions sourmash/__main__.py
@@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
-                       plot, watch, info, storage, migrate)
+                       plot, watch, info, storage, migrate, multigather)
from .lca import main as lca_main

usage='''
@@ -58,7 +58,8 @@ def main():
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main,
-                'migrate': migrate}
+                'migrate': migrate,
+                'multigather': multigather}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
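
The table above is a plain name-to-function dispatch: `main()` looks the
first positional argument up in `commands` and hands the remaining argv
to the matching function. A standalone sketch of the pattern (not the
actual sourmash code; `multigather` here is a stub):

```python
# Minimal sketch of the subcommand dispatch pattern used above.
import argparse
import sys

def multigather(args):
    print('would run multigather on:', args)

commands = {'multigather': multigather}

def main():
    parser = argparse.ArgumentParser(
        description='work with compressed sequence representations')
    parser.add_argument('command', nargs='?')
    args, remaining = parser.parse_known_args()
    cmd = commands.get(args.command)
    if cmd is None:
        parser.print_help()
        sys.exit(1)
    cmd(remaining)   # each subcommand re-parses its own arguments

if __name__ == '__main__':
    main()
```
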
2 changes: 1 addition & 1 deletion sourmash/_minhash.pyx
@@ -273,7 +273,7 @@ cdef class MinHash(object):

old_scaled = get_scaled_for_max_hash(self.max_hash)
if old_scaled > new_num:
-            raise ValueError('new scaled is lower than current sample scaled')
+            raise ValueError('new scaled {} is lower than current sample scaled {}'.format(new_num, old_scaled))

new_max_hash = get_max_hash_for_scaled(new_num)

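
For context: downsampling can only move to a larger `scaled` (a smaller
`max_hash`), because hashes above the old cutoff were never kept. A tiny
sketch of the relationship, assuming the usual sourmash convention that
`max_hash` is roughly `2**64 / scaled` (the exact rounding lives in
`get_max_hash_for_scaled`):

```python
def get_max_hash_for_scaled(scaled):
    # keep hash values below ~2**64 / scaled (sketch of the convention)
    return int(round(2**64 / scaled))

old_scaled, new_scaled = 1000, 100   # illustrative values
if old_scaled > new_scaled:
    # a lower scaled would need hashes that were never retained, so
    # downsample_scaled() refuses -- hence the error message above
    print('new scaled {} is lower than current sample scaled {}'.format(
          new_scaled, old_scaled))
```
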
167 changes: 163 additions & 4 deletions sourmash/commands.py
@@ -833,7 +833,7 @@ def search(args):
query.minhash = query.minhash.downsample_scaled(args.scaled)

# set up the search databases
-    databases = sourmash_args.load_sbts_and_sigs(args.databases, query,
+    databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not args.containment,
args.traverse_directory)

@@ -1018,12 +1018,12 @@ def gather(args):
query.minhash = query.minhash.downsample_scaled(args.scaled)

# empty?
-    if not query.minhash.get_mins():
+    if not len(query.minhash):
error('no query hashes!? exiting.')
sys.exit(-1)

# set up the search databases
-    databases = sourmash_args.load_sbts_and_sigs(args.databases, query, False,
+    databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False,
args.traverse_directory)

if not len(databases):
@@ -1097,7 +1097,7 @@
if args.output_unassigned:
if not found:
notify('nothing found - entire query signature unassigned.')
-        elif not query.minhash.get_mins():
+        elif not len(query.minhash):
notify('no unassigned hashes! not saving.')
else:
outname = args.output_unassigned.name
@@ -1109,6 +1109,165 @@
args.output_unassigned)


def multigather(args):
from .search import gather_databases, format_bp

parser = argparse.ArgumentParser()
parser.add_argument('--db', nargs='+', action='append')
parser.add_argument('--query', nargs='+', action='append')
parser.add_argument('--traverse-directory', action='store_true',
help='search all signatures underneath directories.')
parser.add_argument('--threshold-bp', type=float, default=5e4,
help='threshold (in bp) for reporting results')
parser.add_argument('--scaled', type=float, default=0,
help='downsample query to this scaled factor')
parser.add_argument('-q', '--quiet', action='store_true',
help='suppress non-error output')
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present')
parser.add_argument('-d', '--debug', action='store_true')

sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
sourmash_args.add_moltype_args(parser)

args = parser.parse_args(args)
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

if not args.db:
error('Error! must specify at least one database with --db')
sys.exit(-1)

if not args.query:
error('Error! must specify at least one query signature with --query')
sys.exit(-1)

# flatten --db and --query
args.db = [item for sublist in args.db for item in sublist]
args.query = [item for sublist in args.query for item in sublist]

query = sourmash_args.load_query_signature(args.query[0],
ksize=args.ksize,
select_moltype=moltype)
# set up the search databases
databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
args.traverse_directory)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

# run gather on all the queries.
for queryfile in args.query:
# load the query signature & figure out all the things
query = sourmash_args.load_query_signature(queryfile,
ksize=args.ksize,
select_moltype=moltype)
notify('loaded query: {}... (k={}, {})', query.name()[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))

# verify signature was computed right.
if query.minhash.max_hash == 0:
error('query signature needs to be created with --scaled; skipping')
continue

# downsample if requested
if args.scaled:
notify('downsampling query from scaled={} to {}',
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)

# empty?
if not len(query.minhash):
error('no query hashes!? skipping to next..')
continue

found = []
weighted_missed = 1
for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
# print interim result & save in a list for later use
pct_query = '{:.1f}%'.format(result.f_orig_query*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)

name = result.leaf._display_name(40)


if not len(found): # first result? print header.
if query.minhash.track_abundance and not args.ignore_abundance:
print_results("")
print_results("overlap p_query p_match avg_abund")
print_results("--------- ------- ------- ---------")
else:
print_results("")
print_results("overlap p_query p_match")
print_results("--------- ------- -------")


# print interim result & save in a list for later use
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
average_abund ='{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
average_abund, name)
else:
print_results('{:9} {:>7} {:>7} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
name)
found.append(result)


# basic reporting
print_results('\nfound {} matches total;', len(found))

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
print_results('')

if not found:
notify('nothing found... skipping.')
continue

output_base = os.path.basename(queryfile)
output_csv = output_base + '.csv'

fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
'f_unique_to_query', 'f_unique_weighted',
'average_abund', 'median_abund', 'std_abund', 'name', 'filename', 'md5']
with open(output_csv, 'wt') as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
w.writerow(d)

output_matches = output_base + '.matches.sig'
with open(output_matches, 'wt') as fp:
outname = output_matches
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], fp)

output_unassigned = output_base + '.unassigned.sig'
with open(output_unassigned, 'wt') as fp:
if not found:
notify('nothing found - entire query signature unassigned.')
elif not len(query.minhash):
notify('no unassigned hashes! not saving.')
else:
notify('saving unassigned hashes to "{}"', output_unassigned)

e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
e.add_many(next_query.minhash.get_mins())
sig.save_signatures([ sig.SourmashSignature(e) ], fp)

# fini, next query!


def watch(args):
"Build a signature from raw FASTA/FASTQ coming in on stdin, search."

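
A subtlety in the new `multigather` above: combining `nargs='+'` with
`action='append'` makes argparse collect one list per flag occurrence,
so `--db a b --db c` parses to `[['a', 'b'], ['c']]` -- hence the
flattening step. A small self-contained demonstration (filenames are
made up):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--db', nargs='+', action='append')

args = parser.parse_args('--db a.sbt.json b.lca.json --db c.sig'.split())
print(args.db)   # [['a.sbt.json', 'b.lca.json'], ['c.sig']]

# the same flattening idiom multigather() uses
args.db = [item for sublist in args.db for item in sublist]
print(args.db)   # ['a.sbt.json', 'b.lca.json', 'c.sig']
```
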
70 changes: 34 additions & 36 deletions sourmash/lca/command_gather.py
@@ -81,71 +81,59 @@ def gather_signature(query_sig, dblist, ignore_abundance):
md5_to_lineage = {}
md5_to_name = {}

-    x = set()
-    for hashval in query_mins:
-        for lca_db in dblist:
-            lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
-            for lid in lineage_ids:
-                md5 = lca_db.lineage_id_to_signature[lid]
-                x.add((lca_db, lid, md5))
-
-    for lca_db, lid, md5 in x:
-        md5_to_lineage[md5] = lca_db.lineage_dict[lid]
-        if lca_db.signature_to_name:
-            md5_to_name[md5] = lca_db.signature_to_name[md5]
-        else:
-            md5_to_name[md5] = ''

# now! do the gather:
while 1:
# find all of the assignments for the current set of hashes
assignments = defaultdict(set)
for hashval in query_mins:
for lca_db in dblist:
-                lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
-                for lid in lineage_ids:
-                    md5 = lca_db.lineage_id_to_signature[lid]
-                    signature_size = lca_db.lineage_id_counts[lid]
-                    assignments[hashval].add((md5, signature_size))
+                idx_list = lca_db.hashval_to_idx.get(hashval, [])
+
+                for idx in idx_list:
+                    assignments[hashval].add((lca_db, idx))

# none? quit.
if not assignments:
break

# count the distinct signatures.
counts = Counter()
-        for hashval, md5_set in assignments.items():
-            for (md5, sigsize) in md5_set:
-                counts[(md5, sigsize)] += 1
+        for hashval, match_set in assignments.items():
+            for (lca_db, idx) in match_set:
+                counts[(lca_db, idx)] += 1

# collect the most abundant assignments
common_iter = iter(counts.most_common())
best_list = []
-        (md5, sigsize), top_count = next(common_iter)
+        (best_lca_db, best_idx), top_count = next(common_iter)

-        best_list.append((md5_to_name[md5], md5, sigsize))
-        for (md5, sigsize), count in common_iter:
+        best_list.append((best_lca_db, best_idx))
+        for (lca_db, idx), count in common_iter:
if count != top_count:
break
-            best_list.append((md5_to_name[md5], md5, sigsize))
+            best_list.append((lca_db, idx))

-        # sort on name and pick the top (for consistency).
-        best_list.sort()
-        _, top_md5, top_sigsize = best_list[0]
+        # sort on idx and pick the lowest (for consistency).
+        best_list.sort(key=lambda x: x[1])
+        best_lca_db, best_idx = best_list[0]

equiv_counts = len(best_list) - 1

-        # now, remove from query mins.
+        # now, remove hashes from query mins.
intersect_mins = set()
-        for hashval, md5_set in assignments.items():
-            if (top_md5, top_sigsize) in md5_set:
+        for hashval, match_set in assignments.items():
+            if (best_lca_db, best_idx) in match_set:
query_mins.remove(hashval)
intersect_mins.add(hashval)

# should match!
assert top_count == len(intersect_mins)

# calculate size of match (# of hashvals belonging to that sig)
-        match_size = top_sigsize
+        match_size = 0
+        for hashval, idx_list in best_lca_db.hashval_to_idx.items():
+            if best_idx in idx_list:
+                match_size += 1

# construct 'result' object
intersect_bp = top_count * query_sig.minhash.scaled
@@ -155,13 +143,23 @@ def gather_signature(query_sig, dblist, ignore_abundance):
/ len(intersect_mins)
f_match = len(intersect_mins) / match_size

+        # XXX name and lineage
+        for ident, idx in best_lca_db.ident_to_idx.items():
+            if idx == best_idx:
+                name = best_lca_db.ident_to_name[ident]
+
+        lid = best_lca_db.idx_to_lid.get(best_idx)
+        lineage = ()
+        if lid is not None:
+            lineage = best_lca_db.lid_to_lineage[lid]

result = LCAGatherResult(intersect_bp = intersect_bp,
f_unique_to_query= top_count / n_mins,
f_unique_weighted=f_unique_weighted,
average_abund=average_abund,
f_match=f_match,
-                                 lineage=md5_to_lineage[top_md5],
-                                 name=md5_to_name[top_md5],
+                                 lineage=lineage,
+                                 name=name,
n_equal_matches=equiv_counts)

f_unassigned = len(query_mins) / n_mins
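
The refactor above drops the md5-keyed bookkeeping in favor of direct
`(lca_db, idx)` pairs, but the greedy loop is unchanged in spirit: count
how many remaining query hashes each candidate match covers, take the
best one, subtract its hashes, and repeat. A stripped-down,
self-contained sketch of that loop (a plain dict stands in for the LCA
database; all names and values here are illustrative):

```python
from collections import Counter

# toy "database": maps a hash value to the match indexes containing it
hashval_to_idx = {1: [0], 2: [0, 1], 3: [0, 1], 4: [1], 5: [2]}

query_mins = {1, 2, 3, 4, 5}

while query_mins:
    # count, per candidate match, how many remaining hashes it covers
    counts = Counter()
    for hashval in query_mins:
        for idx in hashval_to_idx.get(hashval, []):
            counts[idx] += 1
    if not counts:
        break

    # best-covering match; lowest idx breaks ties, for consistency
    best_idx, top_count = min(counts.items(),
                              key=lambda kv: (-kv[1], kv[0]))

    # remove the hashes it covers, then repeat on the remainder
    intersect = {h for h in query_mins
                 if best_idx in hashval_to_idx.get(h, [])}
    query_mins -= intersect
    assert top_count == len(intersect)
    print('match {} covers {} hashes'.format(best_idx, len(intersect)))
```
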
(diffs for the remaining changed files not shown)
