Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add lca DBs as inputs to 'sourmash search' and 'gather' #533

Merged
merged 21 commits into from
Dec 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions doc/databases.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ to build the databases.

## Genbank LCA Database

These databases are formatted for use with `sourmash lca`.
These databases are formatted for use with `sourmash lca`; they are
v2 LCA databases and will work with sourmash v2.0a11 and later.

Approximately 87,000 microbial genomes (including viral and fungal)
from NCBI Genbank.

* [Genbank k=21, 2017.11.07](https://osf.io/s3jx8/download), 105 MB
* [Genbank k=31, 2017.11.07](https://osf.io/zskb9/download), 118 MB
* [Genbank k=51, 2017.11.07](https://osf.io/md2nt/download), 123 MB
* [Genbank k=21, 2017.11.07](https://osf.io/d7rv8/download), 109 MB
* [Genbank k=31, 2017.11.07](https://osf.io/4f8n3/download), 120 MB
* [Genbank k=51, 2017.11.07](https://osf.io/nemkw/download), 125 MB

### Details

Expand Down
5 changes: 3 additions & 2 deletions sourmash/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
plot, watch, info, storage, migrate)
plot, watch, info, storage, migrate, multigather)
from .lca import main as lca_main

usage='''
Expand Down Expand Up @@ -58,7 +58,8 @@ def main():
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main,
'migrate': migrate}
'migrate': migrate,
'multigather': multigather}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
Expand Down
2 changes: 1 addition & 1 deletion sourmash/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ cdef class MinHash(object):

old_scaled = get_scaled_for_max_hash(self.max_hash)
if old_scaled > new_num:
raise ValueError('new scaled is lower than current sample scaled')
raise ValueError('new scaled {} is lower than current sample scaled {}'.format(new_num, old_scaled))

new_max_hash = get_max_hash_for_scaled(new_num)

Expand Down
167 changes: 163 additions & 4 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ def search(args):
query.minhash = query.minhash.downsample_scaled(args.scaled)

# set up the search databases
databases = sourmash_args.load_sbts_and_sigs(args.databases, query,
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not args.containment,
args.traverse_directory)

Expand Down Expand Up @@ -1018,12 +1018,12 @@ def gather(args):
query.minhash = query.minhash.downsample_scaled(args.scaled)

# empty?
if not query.minhash.get_mins():
if not len(query.minhash):
error('no query hashes!? exiting.')
sys.exit(-1)

# set up the search databases
databases = sourmash_args.load_sbts_and_sigs(args.databases, query, False,
databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False,
args.traverse_directory)

if not len(databases):
Expand Down Expand Up @@ -1097,7 +1097,7 @@ def gather(args):
if args.output_unassigned:
if not found:
notify('nothing found - entire query signature unassigned.')
elif not query.minhash.get_mins():
elif not len(query.minhash):
notify('no unassigned hashes! not saving.')
else:
outname = args.output_unassigned.name
Expand All @@ -1109,6 +1109,165 @@ def gather(args):
args.output_unassigned)


def multigather(args):
    """Run 'gather' for each of several queries against several databases.

    For every query signature given via --query, run the min-set-cov
    ('gather') algorithm against all databases given via --db, printing
    interim results, and write three files per query named after the query
    file's basename:

    * '<base>.csv' - gather results, one CSV row per match;
    * '<base>.matches.sig' - all matching signatures;
    * '<base>.unassigned.sig' - query hashes assigned to no match.

    Queries must have been computed with --scaled; queries without a
    max_hash are skipped with an error message.
    """
    from .search import gather_databases, format_bp

    parser = argparse.ArgumentParser()
    parser.add_argument('--db', nargs='+', action='append')
    parser.add_argument('--query', nargs='+', action='append')
    parser.add_argument('--traverse-directory', action='store_true',
                        help='search all signatures underneath directories.')
    parser.add_argument('--threshold-bp', type=float, default=5e4,
                        help='threshold (in bp) for reporting results')
    parser.add_argument('--scaled', type=float, default=0,
                        help='downsample query to this scaled factor')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('--ignore-abundance', action='store_true',
                        help='do NOT use k-mer abundances if present')
    parser.add_argument('-d', '--debug', action='store_true')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.db:
        error('Error! must specify at least one database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    # flatten --db and --query: each flag may be given several times, and
    # each occurrence may carry several values (nargs='+' + action='append').
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # use the first query to select compatible databases; the remaining
    # queries are assumed to share its ksize/moltype.
    query = sourmash_args.load_query_signature(args.query[0],
                                               ksize=args.ksize,
                                               select_moltype=moltype)

    # set up the search databases
    databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
                                                args.traverse_directory)

    if not len(databases):
        error('Nothing found to search!')
        sys.exit(-1)

    # run gather on all the queries.
    for queryfile in args.query:
        # load the query signature & figure out all the things
        query = sourmash_args.load_query_signature(queryfile,
                                                   ksize=args.ksize,
                                                   select_moltype=moltype)
        notify('loaded query: {}... (k={}, {})', query.name()[:30],
               query.minhash.ksize,
               sourmash_args.get_moltype(query))

        # verify signature was computed right: gather needs --scaled sigs.
        if query.minhash.max_hash == 0:
            error('query signature needs to be created with --scaled; skipping')
            continue

        # downsample if requested
        if args.scaled:
            notify('downsampling query from scaled={} to {}',
                   query.minhash.scaled, int(args.scaled))
            query.minhash = query.minhash.downsample_scaled(args.scaled)

        # empty?
        if not len(query.minhash):
            error('no query hashes!? skipping to next..')
            continue

        found = []
        weighted_missed = 1
        for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
            if not len(found):                # first result? print header.
                if query.minhash.track_abundance and not args.ignore_abundance:
                    print_results("")
                    print_results("overlap     p_query p_match avg_abund")
                    print_results("---------   ------- ------- ---------")
                else:
                    print_results("")
                    print_results("overlap     p_query p_match")
                    print_results("---------   ------- -------")

            # print interim result & save in a list for later use
            pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
            pct_genome = '{:.1f}%'.format(result.f_match*100)
            average_abund ='{:.1f}'.format(result.average_abund)
            name = result.leaf._display_name(40)

            if query.minhash.track_abundance and not args.ignore_abundance:
                print_results('{:9}   {:>7} {:>7} {:>9}    {}',
                              format_bp(result.intersect_bp), pct_query,
                              pct_genome, average_abund, name)
            else:
                print_results('{:9}   {:>7} {:>7}    {}',
                              format_bp(result.intersect_bp), pct_query,
                              pct_genome, name)
            found.append(result)

        # basic reporting
        print_results('\nfound {} matches total;', len(found))
        print_results('the recovered matches hit {:.1f}% of the query',
                      (1 - weighted_missed) * 100)
        print_results('')

        if not found:
            notify('nothing found... skipping.')
            continue

        # outputs are named after the query file's basename, in the cwd.
        output_base = os.path.basename(queryfile)
        output_csv = output_base + '.csv'

        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
                      'f_unique_to_query', 'f_unique_weighted',
                      'average_abund', 'median_abund', 'std_abund', 'name',
                      'filename', 'md5']
        with open(output_csv, 'wt') as fp:
            w = csv.DictWriter(fp, fieldnames=fieldnames)
            w.writeheader()
            for result in found:
                d = dict(result._asdict())
                del d['leaf']                 # actual signature not in CSV.
                w.writerow(d)

        output_matches = output_base + '.matches.sig'
        with open(output_matches, 'wt') as fp:
            notify('saving all matches to "{}"', output_matches)
            sig.save_signatures([ r.leaf for r in found ], fp)

        # save whatever of the query remains unassigned after all matches;
        # 'found' is guaranteed non-empty here (see 'continue' above).
        output_unassigned = output_base + '.unassigned.sig'
        with open(output_unassigned, 'wt') as fp:
            if not len(query.minhash):
                notify('no unassigned hashes! not saving.')
            else:
                notify('saving unassigned hashes to "{}"', output_unassigned)

                e = MinHash(ksize=query.minhash.ksize, n=0,
                            max_hash=new_max_hash)
                e.add_many(next_query.minhash.get_mins())
                sig.save_signatures([ sig.SourmashSignature(e) ], fp)

        # fini, next query!

def watch(args):
"Build a signature from raw FASTA/FASTQ coming in on stdin, search."

Expand Down
70 changes: 34 additions & 36 deletions sourmash/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,71 +81,59 @@ def gather_signature(query_sig, dblist, ignore_abundance):
md5_to_lineage = {}
md5_to_name = {}

x = set()
for hashval in query_mins:
for lca_db in dblist:
lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
for lid in lineage_ids:
md5 = lca_db.lineage_id_to_signature[lid]
x.add((lca_db, lid, md5))

for lca_db, lid, md5 in x:
md5_to_lineage[md5] = lca_db.lineage_dict[lid]
if lca_db.signature_to_name:
md5_to_name[md5] = lca_db.signature_to_name[md5]
else:
md5_to_name[md5] = ''

# now! do the gather:
while 1:
# find all of the assignments for the current set of hashes
assignments = defaultdict(set)
for hashval in query_mins:
for lca_db in dblist:
lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
for lid in lineage_ids:
md5 = lca_db.lineage_id_to_signature[lid]
signature_size = lca_db.lineage_id_counts[lid]
assignments[hashval].add((md5, signature_size))
idx_list = lca_db.hashval_to_idx.get(hashval, [])

for idx in idx_list:
assignments[hashval].add((lca_db, idx))

# none? quit.
if not assignments:
break

# count the distinct signatures.
counts = Counter()
for hashval, md5_set in assignments.items():
for (md5, sigsize) in md5_set:
counts[(md5, sigsize)] += 1
for hashval, match_set in assignments.items():
for (lca_db, idx) in match_set:
counts[(lca_db, idx)] += 1

# collect the most abundant assignments
common_iter = iter(counts.most_common())
best_list = []
(md5, sigsize), top_count = next(common_iter)
(best_lca_db, best_idx), top_count = next(common_iter)

best_list.append((md5_to_name[md5], md5, sigsize))
for (md5, sigsize), count in common_iter:
best_list.append((best_lca_db, best_idx))
for (lca_db, idx), count in common_iter:
if count != top_count:
break
best_list.append((md5_to_name[md5], md5, sigsize))
best_list.append((lca_db, idx))

# sort on name and pick the top (for consistency).
best_list.sort()
_, top_md5, top_sigsize = best_list[0]
# sort on idx and pick the lowest (for consistency).
best_list.sort(key=lambda x: x[1])
best_lca_db, best_idx = best_list[0]

equiv_counts = len(best_list) - 1

# now, remove from query mins.
# now, remove hashes from query mins.
intersect_mins = set()
for hashval, md5_set in assignments.items():
if (top_md5, top_sigsize) in md5_set:
for hashval, match_set in assignments.items():
if (best_lca_db, best_idx) in match_set:
query_mins.remove(hashval)
intersect_mins.add(hashval)

# should match!
assert top_count == len(intersect_mins)

# calculate size of match (# of hashvals belonging to that sig)
match_size = top_sigsize
match_size = 0
for hashval, idx_list in best_lca_db.hashval_to_idx.items():
ctb marked this conversation as resolved.
Show resolved Hide resolved
if best_idx in idx_list:
match_size += 1

# construct 'result' object
intersect_bp = top_count * query_sig.minhash.scaled
Expand All @@ -155,13 +143,23 @@ def gather_signature(query_sig, dblist, ignore_abundance):
/ len(intersect_mins)
f_match = len(intersect_mins) / match_size

# XXX name and lineage
for ident, idx in best_lca_db.ident_to_idx.items():
if idx == best_idx:
name = best_lca_db.ident_to_name[ident]

lid = best_lca_db.idx_to_lid.get(best_idx)
lineage = ()
if lid is not None:
lineage = best_lca_db.lid_to_lineage[lid]

result = LCAGatherResult(intersect_bp = intersect_bp,
f_unique_to_query= top_count / n_mins,
f_unique_weighted=f_unique_weighted,
average_abund=average_abund,
f_match=f_match,
lineage=md5_to_lineage[top_md5],
name=md5_to_name[top_md5],
lineage=lineage,
name=name,
n_equal_matches=equiv_counts)

f_unassigned = len(query_mins) / n_mins
Expand Down
Loading