Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add script for merging signatures by value in spreadsheet column #1

Merged
merged 5 commits into from
Apr 6, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ Another example: [building protein and DNA databases starting from genomes](./ex
* `genbank-to-fromfile.py` - build a `fromfile` CSV file from a list of FASTA files downloaded from Genbank
* `kiln.py` - support library for building `fromfile` CSVs.
* `mass-rename.py` - a script to bulk-rename sourmash signatures.
* `mass-merge.py` - a script to bulk-merge sourmash signatures by spreadsheet column attribute.
* `sigs-to-manifest.py` - a script to extract and/or update sourmash manifests from many databases.
203 changes: 203 additions & 0 deletions mass-merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#! /usr/bin/env python
"""
Merge signatures in bulk, based on an attribute in the spreadsheet.

mass-merge.py takes a list of databases along with spreadsheets (w/-F),
and merges signatures by values in the specified `merge-col`.

The spreadsheet must contain two columns, 'ident' and the value of `--merge-col`;
signatures are selected based on 'ident' and renamed to the value found in the merge column.
Singletons (no additional signatures to merge) will be renamed for consistency.
"""
import sys
import argparse
import csv
from collections import defaultdict

import sourmash

from sourmash.picklist import SignaturePicklist
from sourmash.logging import set_quiet, error, notify, print_results, debug
from sourmash import sourmash_args
from sourmash.cli.utils import (add_moltype_args, add_ksize_arg)
#from sourmash.sig import _check_abundance_compatibility

def _check_abundance_compatibility(sig1, sig2):
if sig1.minhash.track_abundance != sig2.minhash.track_abundance:
raise ValueError("incompatible signatures: track_abundance is {} in first sig, {} in second".format(sig1.minhash.track_abundance, sig2.minhash.track_abundance))


def massmerge(args):
"""
rename one or more signatures.
"""
set_quiet(args.quiet, args.quiet)
moltype = sourmash_args.calculate_moltype(args)
merge_col = args.merge_col

# load spreadsheets
merge_d = defaultdict(list)
all_idents=set()
for filename in args.from_spreadsheet:
count = 0
with open(filename, newline='') as fp:
first_entry= True
r = csv.DictReader(fp)
for row in r:
if first_entry:
if not merge_col in r.fieldnames:
error(f"ERROR on spreadsheet '{filename}'.")
error(f"Merge column {merge_col} is not present.")
sys.exit(-1)
merge_name = row[merge_col]
ident = row['ident']

assert ' ' not in ident, f"identifiers cannot have spaces - but '{ident}' does."
assert ident not in all_idents, f"duplicate identifer: '{ident}'"
all_idents.add(ident)

merge_d[merge_name].append(ident)
count += 1
notify(f"loaded {count} identifiers from '{filename}'")

num_merge_names = len(merge_d)
notify(f"found a total of {num_merge_names} distinct values for signature merging.")

# load each db and check that we can find all idents
found_idents = set()
idx_list = []
for db in args.dblist:
notify(f"loading index '{db}'")
idx = sourmash.load_file_as_index(db)

manifest = idx.manifest
if manifest is None:
error(f"ERROR on filename '{db}'.")
error("No manifest, but a manifest is required.")
sys.exit(-1)

ident_picklist = SignaturePicklist('ident')
ident_picklist.pickset = all_idents

check_idx = idx.select(ksize=args.ksize,
moltype=moltype,
picklist=ident_picklist)
found_idents.update(ident_picklist.found)
idx_list.append(idx)

# make sure that we get all the things.
if not all_idents.issubset(found_idents):
remaining = all_idents - found_idents
error(f"ERROR: {len(remaining)} identifiers from spreadsheet not found.")
example_missing = "\n".join(remaining)
error(f"Here are some examples: {example_missing}")
sys.exit(-1)

if args.check:
notify("Everything looks copacetic. Exiting as requested by `--check`")
sys.exit(0)

notify("Everything looks copacetic. Proceeding to merge!")

# go through, do merge, save.
with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
n=0
n_singletons = 0
for m, (merge_name, idents) in enumerate(merge_d.items()):
# keep count of singletons
if len(idents) == 1:
n_singletons+=1
if m % 100 == 0:
merge_percent = float(n)/len(found_idents) * 100
notify(f"...merging sigs for {merge_name} ({merge_percent:.1f}% of sigs merged)", end="\r")

# build a new picklist for idents to be merged
ident_picklist = SignaturePicklist('ident')
ident_picklist.pickset = set(idents)

# loop through each idx (db), select sigs, merge mh
first_sig = None
mh = None
merged_ss = None
for idx in idx_list:
this_idx = idx.select(ksize=args.ksize,
moltype=moltype,
picklist=ident_picklist)
for ss in this_idx.signatures():
n += 1
# first sig? initialize some things
if first_sig is None:
first_sig = ss
mh = first_sig.minhash.copy_and_clear()

# forcibly remove abundance?
if args.flatten:
mh.track_abundance = False

try:
sigobj_mh = ss.minhash
if not args.flatten:
_check_abundance_compatibility(first_sig, ss)
else:
sigobj_mh.track_abundance = False

mh.merge(sigobj_mh)
except (TypeError, ValueError) as exc:
error("ERROR when merging signature '{}' ({}) from file {}",
ss, ss.md5sum()[:8])
error(str(exc))
sys.exit(-1)

merge_percent = float(n)/len(found_idents) * 100
notify(f"...merged {len(idents)} sigs for {merge_name} ({merge_percent:.1f}% of sigs merged)", end="\r")
# create merged sig and write to output
merged_ss = sourmash.SourmashSignature(mh, name=merge_name)
save_sigs.add(merged_ss)

notify(f"merged {n} signatures into {len(save_sigs)} signatures by column: {merge_col}")
notify(f" of these, {n_singletons} were singletons (no merge; just renamed)")


def main():
p = argparse.ArgumentParser()
p.add_argument('dblist', nargs='+')

p.add_argument(
'-q', '--quiet', action='store_true',
help='suppress non-error output'
)
p.add_argument(
'--flatten', action='store_true',
help='remove abundances from all signatures while merging'
)
p.add_argument(
'--merge-col', required=True,
help='the column to merge signatures by (required)'
)
p.add_argument(
'-o', '--output', metavar='FILE', default='-',
help='output signature to this file (default stdout)'
)
p.add_argument(
'-f', '--force', action='store_true',
help='try to load all files as signatures'
)
p.add_argument(
'--check', action='store_true',
help='Just check for ability to merge; do not actually merge signatures.'
)
p.add_argument('-F', '--from-spreadsheet',
required=True,
action='append', default=[],
help="input spreadsheet containing 'ident' and '--merge-col` columns")

add_ksize_arg(p, 31)
add_moltype_args(p)

args = p.parse_args()

massmerge(args)


if __name__ == '__main__':
sys.exit(main())