Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: upgrade sig overlap and sig subtract to load more than JSON signatures #3153

Merged
merged 10 commits into from
Jun 4, 2024
16 changes: 8 additions & 8 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,10 @@ def overlap(args):

moltype = sourmash_args.calculate_moltype(args)

sig1 = sourmash.load_one_signature(
sig1 = sourmash_args.load_one_signature(
args.signature1, ksize=args.ksize, select_moltype=moltype
)
sig2 = sourmash.load_one_signature(
sig2 = sourmash_args.load_one_signature(
args.signature2, ksize=args.ksize, select_moltype=moltype
)

Expand Down Expand Up @@ -573,7 +573,7 @@ def intersect(args):
# borrow abundances from a signature?
if args.abundances_from:
notify(f"loading signature from {args.abundances_from}, keeping abundances")
abund_sig = sourmash.load_one_signature(
abund_sig = sourmash_args.load_one_signature(
args.abundances_from, ksize=args.ksize, select_moltype=moltype
)
if not abund_sig.minhash.track_abundance:
Expand Down Expand Up @@ -646,9 +646,8 @@ def subtract(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

from_sigfile = args.signature_from
from_sigobj = sourmash.load_one_signature(
from_sigfile, ksize=args.ksize, select_moltype=moltype
from_sigobj = sourmash_args.load_one_signature(
args.signature_from, ksize=args.ksize, select_moltype=moltype
)

if args.abundances_from: # it's ok to work with abund signatures if -A.
Expand All @@ -661,7 +660,7 @@ def subtract(args):

subtract_mins = set(from_mh.hashes)

notify(f"loaded signature from {from_sigfile}...", end="\r")
notify(f"loaded signature from {args.signature_from}...", end="\r")

progress = sourmash_args.SignatureLoadingProgress()

Expand Down Expand Up @@ -694,9 +693,10 @@ def subtract(args):
# borrow abundances from somewhere?
if args.abundances_from:
notify(f"loading signature from {args.abundances_from}, keeping abundances")
abund_sig = sourmash.load_one_signature(
abund_sig = sourmash_args.load_one_signature(
args.abundances_from, ksize=args.ksize, select_moltype=moltype
)

if not abund_sig.minhash.track_abundance:
error("--track-abundance not set on loaded signature?! exiting.")
sys.exit(-1)
Expand Down
37 changes: 37 additions & 0 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,3 +810,40 @@ def load_file_as_signatures(
return progress.start_file(filename, loader)
else:
return loader


def load_one_signature(
    filename,
    *,
    select_moltype=None,
    ksize=None,
    picklist=None,
    yield_all_files=False,
    pattern=None,
):
    """Load exactly one signature from 'filename'.

    The database returned by `_load_database` is narrowed with
    `select(moltype=..., ksize=...)` and then with
    `apply_picklist_and_pattern`; whatever remains must be a single
    signature.

    Args:
        filename: location handed to `_load_database`.
        select_moltype: molecule type passed to `select` as `moltype`.
        ksize: k-mer size passed to `select`.
        picklist: picklist forwarded to `apply_picklist_and_pattern`.
        yield_all_files: forwarded unchanged to `_load_database`.
        pattern: pattern forwarded to `apply_picklist_and_pattern`.

    Returns:
        The single signature that remains after selection.

    Raises:
        ValueError: if no signature, or more than one signature, remains.
    """
    db = _load_database(filename, yield_all_files)
    db = db.select(moltype=select_moltype, ksize=ksize)

    # apply pattern search & picklist
    db = apply_picklist_and_pattern(db, picklist, pattern)

    # Build the iterator exactly once. Calling iter() a second time on a
    # re-iterable container (e.g. a list) would start over at the first
    # signature and make the "second signature" check fire spuriously.
    sig_iter = iter(db.signatures())

    # load exactly one!
    try:
        ss = next(sig_iter)
    except StopIteration:
        # the StopIteration carries no useful context for the caller
        raise ValueError(
            f"no signatures in '{filename}'? expected exactly one."
        ) from None

    # make sure there's not a second one...
    try:
        next(sig_iter)
    except StopIteration:
        return ss

    raise ValueError(f"more than one signature in '{filename}'; expected exactly one")
Binary file added tests/test-data/47.fa.sig.zip
Binary file not shown.
Binary file added tests/test-data/63.fa.sig.zip
Binary file not shown.
68 changes: 54 additions & 14 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,8 +769,8 @@ def test_sig_inflate_5_bad_moltype(runtmp):
assert "no signatures to inflate" in runtmp.last_result.err


@utils.in_tempdir
def test_sig_subtract_1(c):
def test_sig_subtract_1(runtmp):
c = runtmp
# subtract of 63 from 47
sig47 = utils.get_test_data("47.fa.sig")
sig63 = utils.get_test_data("63.fa.sig")
Expand All @@ -789,6 +789,28 @@ def test_sig_subtract_1(c):
assert set(actual_subtract_sig.minhash.hashes.keys()) == set(mins)


def test_sig_subtract_1_sigzip(runtmp):
    # subtract of 63 from 47, with both inputs supplied as .sig.zip files
    from sourmash import sourmash_args

    path47 = utils.get_test_data("47.fa.sig.zip")
    path63 = utils.get_test_data("63.fa.sig.zip")
    runtmp.run_sourmash("sig", "subtract", path47, path63)

    # stdout should be new signature
    stdout = runtmp.last_result.out

    hashes47 = set(sourmash_args.load_one_signature(path47).minhash.hashes.keys())
    hashes63 = set(sourmash_args.load_one_signature(path63).minhash.hashes.keys())
    subtracted = sourmash.load_one_signature(stdout)

    # the output must hold exactly the hashes of 47 that are absent from 63
    expected = hashes47 - hashes63
    assert set(subtracted.minhash.hashes.keys()) == expected


def test_sig_subtract_1_abund(runtmp):
# subtract 63 from 47, with abundances borrowed from 47

Expand Down Expand Up @@ -856,8 +878,8 @@ def test_sig_subtract_1_flatten(runtmp):
assert set(actual_subtract_sig.minhash.hashes.keys()) == set(mins)


@utils.in_tempdir
def test_sig_subtract_1_multisig(c):
def test_sig_subtract_1_multisig(runtmp):
c = runtmp
# subtract of everything from 47
sig47 = utils.get_test_data("47.fa.sig")
multisig = utils.get_test_data("47+63-multisig.sig")
Expand All @@ -871,8 +893,8 @@ def test_sig_subtract_1_multisig(c):
assert not set(actual_subtract_sig.minhash.hashes.keys())


@utils.in_tempdir
def test_sig_subtract_2(c):
def test_sig_subtract_2(runtmp):
c = runtmp
# subtract of 63 from 47 should fail if 47 has abund
sig47 = utils.get_test_data("track_abund/47.fa.sig")
sig63 = utils.get_test_data("63.fa.sig")
Expand All @@ -881,8 +903,8 @@ def test_sig_subtract_2(c):
c.run_sourmash("sig", "subtract", sig47, sig63)


@utils.in_tempdir
def test_sig_subtract_3(c):
def test_sig_subtract_3(runtmp):
c = runtmp
# subtract of 63 from 47 should fail if 63 has abund
sig47 = utils.get_test_data("47.fa.sig")
sig63 = utils.get_test_data("track_abund/63.fa.sig")
Expand All @@ -891,8 +913,8 @@ def test_sig_subtract_3(c):
c.run_sourmash("sig", "subtract", sig47, sig63)


@utils.in_tempdir
def test_sig_subtract_4_ksize_fail(c):
def test_sig_subtract_4_ksize_fail(runtmp):
c = runtmp
# subtract of 2 from 47 should fail without -k specified
sig47 = utils.get_test_data("47.fa.sig")
sig2 = utils.get_test_data("2.fa.sig")
Expand All @@ -901,8 +923,8 @@ def test_sig_subtract_4_ksize_fail(c):
c.run_sourmash("sig", "subtract", sig47, sig2)


@utils.in_tempdir
def test_sig_subtract_4_ksize_succeed(c):
def test_sig_subtract_4_ksize_succeed(runtmp):
c = runtmp
# subtract of 2 from 47 should succeed when -k is specified
sig47 = utils.get_test_data("47.fa.sig")
sig2 = utils.get_test_data("2.fa.sig")
Expand Down Expand Up @@ -3839,8 +3861,8 @@ def test_sig_describe_3_manifest_fails_when_moved(runtmp):
runtmp.sourmash("sig", "describe", "mf.csv")


@utils.in_tempdir
def test_sig_overlap(c):
def test_sig_overlap(runtmp):
c = runtmp
# get overlap details
sig47 = utils.get_test_data("47.fa.sig")
sig63 = utils.get_test_data("63.fa.sig")
Expand All @@ -3857,6 +3879,24 @@ def test_sig_overlap(c):
assert "number of hashes in common: 2529" in out


def test_sig_overlap_2(runtmp):
    # get overlap details, with both signatures supplied as .sig.zip files
    zip47 = utils.get_test_data("47.fa.sig.zip")
    zip63 = utils.get_test_data("63.fa.sig.zip")
    runtmp.run_sourmash("sig", "overlap", zip47, zip63)

    stdout = runtmp.last_result.out
    print(stdout)

    expected_fragments = (
        # md5s
        "09a08691ce52952152f0e866a59f6261",
        "38729c6374925585db28916b82a6f513",
        # overlap statistics
        "similarity: 0.32069",
        "number of hashes in common: 2529",
    )
    for fragment in expected_fragments:
        assert fragment in stdout


@utils.in_tempdir
def test_import_export_1(c):
# check to make sure we can import what we've exported!
Expand Down
28 changes: 28 additions & 0 deletions tests/test_sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,3 +841,31 @@ def test_bug_2370(runtmp):
# try running sourmash_args.load_file_as_index
# runtmp.sourmash('sig', 'describe', runtmp.output('not_really_gzipped.gz'))
sourmash_args.load_file_as_index(runtmp.output("not_really_gzipped.gz"))


def test_load_one_signature_1(runtmp):
    # sourmash_args.load_one_signature should return a signature from a zipfile
    loaded = sourmash_args.load_one_signature(
        utils.get_test_data("63.fa.sig.zip"),
        ksize=31,
    )

    expected_prefix = "NC_011663.1 "
    assert loaded.name.startswith(expected_prefix)


def test_load_one_signature_2_fail(runtmp):
    # test the sourmash_args.load_one_signature function on failure - no sig
    sigfile = utils.get_test_data("63.fa.sig.zip")

    # selecting ksize=21 is expected to leave nothing to load
    with pytest.raises(ValueError) as exc:
        sourmash_args.load_one_signature(sigfile, ksize=21)

    # inspect the raised exception itself (exc.value), not the pytest
    # ExceptionInfo wrapper, whose str() form is a debugging representation
    assert "expected exactly one." in str(exc.value)


def test_load_one_signature_3_fail(runtmp):
    # test the sourmash_args.load_one_signature function on failure - many sigs
    sigfile = utils.get_test_data("prot/all.zip")

    # with no ksize/moltype selection, more than one signature is expected
    with pytest.raises(ValueError) as exc:
        sourmash_args.load_one_signature(sigfile)

    # inspect the raised exception itself (exc.value), not the pytest
    # ExceptionInfo wrapper, whose str() form is a debugging representation
    assert "more than one signature" in str(exc.value)
Loading