Skip to content

Commit

Permalink
[MRG] use picklist to exclude signatures (#1623)
Browse files Browse the repository at this point in the history
* one way to do exclusionary picklists

* add extract picklist exclude tests; minor reporting change for exclude

* propagate exclude testing

* use PickStyle enum

* do not count n_missing for exclude; document command line

* text format

* add explicit test for include
  • Loading branch information
bluegenes authored Jun 22, 2021
1 parent 0814bcc commit c04f137
Show file tree
Hide file tree
Showing 9 changed files with 848 additions and 26 deletions.
12 changes: 12 additions & 0 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,18 @@ One way to build a picklist is to use `sourmash sig describe --csv
out.csv <signatures>` to construct an initial CSV file that you can
then edit further.

The picklist functionality also supports excluding (rather than
including) signatures matching the picklist arguments. To specify a
picklist for exclusion, add `:exclude` to the `--picklist` argument
string, e.g. `pickfile:colname:coltype:exclude`.

For example,
```
sourmash sig extract --picklist list.csv:md5sum:md5:exclude <signatures>
```
will extract only the signatures that have md5sums that **do not** match
entries in the column `md5sum` in the CSV file `list.csv`.

In addition to `sig extract`, the following commands support
`--picklist` selection: `index`, `search`, `gather`, `prefetch`,
`compare`, and `lca index`.
Expand Down
30 changes: 26 additions & 4 deletions src/sourmash/picklist.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"Picklist code for extracting subsets of signatures."
import csv
from enum import Enum

# set up preprocessing functions for column stuff
preprocess = {}
Expand All @@ -17,6 +18,11 @@
preprocess['md5short'] = lambda x: x[:8]


class PickStyle(Enum):
    """How a picklist is applied to candidate signatures.

    INCLUDE selects signatures whose value is present in the picklist;
    EXCLUDE selects signatures whose value is absent from the picklist.
    """
    INCLUDE = 1
    EXCLUDE = 2


class SignaturePicklist:
"""Picklist class for subsetting collections of signatures.
Expand All @@ -41,15 +47,17 @@ class SignaturePicklist:
supported_coltypes = ('md5', 'md5prefix8', 'md5short',
'name', 'ident', 'identprefix')

def __init__(self, coltype, *, pickfile=None, column_name=None):
def __init__(self, coltype, *, pickfile=None, column_name=None, pickstyle=PickStyle.INCLUDE):
"create a picklist of column type 'coltype'."
self.coltype = coltype
self.pickfile = pickfile
self.column_name = column_name
self.pickstyle = pickstyle

if coltype not in self.supported_coltypes:
raise ValueError(f"invalid picklist column type '{coltype}'")


self.preprocess_fn = preprocess[coltype]
self.pickset = None
self.found = set()
Expand All @@ -60,6 +68,15 @@ def from_picklist_args(cls, argstr):
"load a picklist from an argument string 'pickfile:column:coltype'"
picklist = argstr.split(':')
if len(picklist) != 3:
if len(picklist) == 4:
pickfile, column, coltype, pickstyle = picklist
if pickstyle == 'include':
return cls(coltype, pickfile=pickfile, column_name=column, pickstyle=PickStyle.INCLUDE)
elif pickstyle == 'exclude':
return cls(coltype, pickfile=pickfile, column_name=column, pickstyle=PickStyle.EXCLUDE)
else:
raise ValueError(f"invalid picklist 'pickstyle' argument, '{pickstyle}': must be 'include' or 'exclude'")

raise ValueError(f"invalid picklist argument '{argstr}'")

assert len(picklist) == 3
Expand Down Expand Up @@ -131,9 +148,14 @@ def __contains__(self, ss):
self.n_queries += 1

# determine if ok or not.
if q in self.pickset:
self.found.add(q)
return True
if self.pickstyle == PickStyle.INCLUDE:
if q in self.pickset:
self.found.add(q)
return True
elif self.pickstyle == PickStyle.EXCLUDE:
if q not in self.pickset:
self.found.add(q)
return True
return False

def filter(self, it):
Expand Down
17 changes: 11 additions & 6 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex)
from . import signature as sigmod
from .picklist import SignaturePicklist
from .picklist import SignaturePicklist, PickStyle


DEFAULT_LOAD_K = 31
Expand Down Expand Up @@ -84,10 +84,15 @@ def load_picklist(args):


def report_picklist(args, picklist):
notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values")
n_missing = len(picklist.pickset - picklist.found)
if picklist.pickstyle == PickStyle.INCLUDE:
notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values")
n_missing = len(picklist.pickset - picklist.found)
elif picklist.pickstyle == PickStyle.EXCLUDE:
notify(f"for given picklist, found {len(picklist.found)} matches by excluding {len(picklist.pickset)} distinct values")
n_missing = 0
if n_missing:
notify(f"WARNING: {n_missing} missing picklist values.")
# Note - picklist_require_all is currently only relevant for PickStyle.INCLUDE
if args.picklist_require_all:
error("ERROR: failing because --picklist-require-all was set")
sys.exit(-1)
Expand Down Expand Up @@ -435,7 +440,7 @@ def load_pathlist_from_file(filename):
if not os.path.exists(checkfile):
raise ValueError(f"file '{checkfile}' inside the pathlist does not exist")
except IOError:
raise ValueError(f"pathlist file '{filename}' does not exist")
raise ValueError(f"pathlist file '{filename}' does not exist")
except OSError:
raise ValueError(f"cannot open file '{filename}'")
except UnicodeDecodeError:
Expand Down Expand Up @@ -632,7 +637,7 @@ class SaveSignatures_Directory(_BaseSaveSignaturesToLocation):
"Save signatures within a directory, using md5sum names."
def __init__(self, location):
super().__init__(location)

def __repr__(self):
return f"SaveSignatures_Directory('{self.location}')"

Expand Down Expand Up @@ -709,7 +714,7 @@ class SaveSignatures_ZipFile(_BaseSaveSignaturesToLocation):
def __init__(self, location):
super().__init__(location)
self.zf = None

def __repr__(self):
return f"SaveSignatures_ZipFile('{self.location}')"

Expand Down
Loading

0 comments on commit c04f137

Please sign in to comment.