Skip to content

Commit

Permalink
move manifest stuff into manifest class
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jun 16, 2021
1 parent a4057e6 commit 72d8497
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 49 deletions.
60 changes: 36 additions & 24 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,11 @@ class CollectionManifest:
* 'locations()' returns all distinct locations for e.g. lazy loading
* supports container protocol for signatures, e.g. 'if ss in manifest: ...'
"""
# each manifest row must have the following, although they may be empty.
required_keys = ('internal_location',
'md5', 'md5short', 'ksize', 'moltype', 'num',
'scaled', 'n_hashes', 'with_abundance',
'name', 'filename')

def __init__(self, rows):
"Initialize from an iterable of metadata dictionaries."
Expand All @@ -973,12 +978,11 @@ def load_from_csv(cls, fp):
"load a manifest from a CSV file."
manifest_list = []
r = csv.DictReader(fp)
for k in ('internal_location', 'md5', 'md5short', 'ksize',
'moltype', 'num', 'scaled', 'n_hashes', 'seed',
'with_abundance', 'name'):
if not r.fieldnames:
return None
if not r.fieldnames:
raise ValueError("missing column headers in manifest")
return None

for k in cls.required_keys:
if k not in r.fieldnames:
raise ValueError(f"missing column '{k}' in manifest.")

Expand All @@ -990,32 +994,40 @@ def load_from_csv(cls, fp):
return cls(manifest_list)

@classmethod
def create_manifest(cls, locations_iter):
def make_manifest_row(cls, ss, location, *, include_signature=True):
    """Build a single manifest row (a dict) describing signature 'ss'.

    The row holds the signature's md5 / md5short, the ksize, moltype, num,
    scaled, number of hashes and abundance flag of its minhash, its name
    and filename, and 'location' stored under 'internal_location'.

    The metadata keys must match 'cls.required_keys' exactly; if they do
    not, AssertionError is raised naming the mismatched keys.

    If 'include_signature' is true (the default), the signature object
    itself is also stored in the row under 'signature'. This extra key is
    added after the required-keys check and is not part of it.
    """
    mh = ss.minhash
    md5 = ss.md5sum()

    row = {
        'md5': md5,
        'md5short': md5[:8],
        'ksize': mh.ksize,
        'moltype': mh.moltype,
        'num': mh.num,
        'scaled': mh.scaled,
        'n_hashes': len(mh),
        'with_abundance': 1 if mh.track_abundance else 0,
        'name': ss.name,
        # @CTB: do we want filename in manifests?
        'filename': ss.filename,
        # @CTB: change key, maybe just make it 'location'
        'internal_location': location,
    }

    # Internal consistency check between this method and 'required_keys'.
    # Raised explicitly (rather than via a bare 'assert') so that it still
    # runs under 'python -O' and reports which keys are out of sync.
    required = set(cls.required_keys)
    present = set(row)
    if present != required:
        missing = sorted(required - present)
        unexpected = sorted(present - required)
        raise AssertionError(
            "manifest row keys do not match required_keys "
            f"(missing: {missing}; unexpected: {unexpected})")

    if include_signature:
        # CTB: track signature when creating manifest w/this info.
        row['signature'] = ss
    return row

@classmethod
def create_manifest(cls, locations_iter, *, include_signature=True):
    """Create a manifest from an iterator that yields (ss, location).

    Each (ss, location) pair is turned into a row by
    'cls.make_manifest_row', and the rows are passed to 'cls(...)'.

    Stores signatures in manifest rows by default; pass
    'include_signature=False' to leave them out.

    Note: do NOT catch exceptions here, so this passes through load excs.
    """
    manifest_list = []
    for ss, location in locations_iter:
        # forward the caller's choice instead of always storing signatures.
        row = cls.make_manifest_row(ss, location,
                                    include_signature=include_signature)
        manifest_list.append(row)

    return cls(manifest_list)
Expand Down
33 changes: 8 additions & 25 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,16 +255,15 @@ def manifest(args):
"""
build a signature manifest
"""
from sourmash.index import CollectionManifest

set_quiet(args.quiet)

# CTB: might want to switch to sourmash_args.FileOutputCSV here?
csv_fp = open(args.output, 'w', newline='')
w = csv.DictWriter(csv_fp,
['internal_location',
'md5', 'md5short', 'ksize', 'moltype', 'num',
'scaled', 'n_hashes', 'seed', 'with_abundance',
'name', 'filename', 'license'],
extrasaction='ignore')

keys = CollectionManifest.required_keys
w = csv.DictWriter(csv_fp, fieldnames=keys)
w.writeheader()

try:
Expand All @@ -278,26 +277,10 @@ def manifest(args):

n = 0
for n, (sig, parent, loc) in enumerate(loader.signatures_with_internal()):
internal_location = loc

# extract info, write as appropriate.
mh = sig.minhash
ksize = mh.ksize
moltype = mh.moltype
scaled = mh.scaled
num = mh.num
seed = mh.seed
n_hashes = len(mh)
with_abundance = 0
if mh.track_abundance:
with_abundance = 1
md5 = sig.md5sum()
md5short = md5[:8]
name = sig.name
filename = sig.filename
license = sig.license

w.writerow(locals())
row = CollectionManifest.make_manifest_row(sig, loc,
include_signature=False)
w.writerow(row)

notify(f'built manifest for {n} signatures total.')

Expand Down

0 comments on commit 72d8497

Please sign in to comment.