Skip to content

Commit

Permalink
change internal zipfile writing to use ZipStorage
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jun 16, 2021
1 parent 39abe57 commit 5a185bb
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 18 deletions.
51 changes: 33 additions & 18 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from enum import Enum
import traceback
import gzip
import zipfile
from io import StringIO

import screed
import sourmash
Expand All @@ -17,7 +17,8 @@

from .logging import notify, error, debug_literal

from .index import (LinearIndex, ZipFileLinearIndex, LoadedCollection)
from .index import (LinearIndex, ZipFileLinearIndex, LoadedCollection,
CollectionManifest)
from . import signature as sigmod

DEFAULT_LOAD_K = 31
Expand Down Expand Up @@ -671,43 +672,57 @@ class SaveSignatures_ZipFile(_BaseSaveSignaturesToLocation):
"Save compressed signatures in an uncompressed Zip file."
def __init__(self, location):
super().__init__(location)
self.zf = None
self.storage = None

def __repr__(self):
return f"SaveSignatures_ZipFile('{self.location}')"

def close(self):
self.zf.close()
# finish constructing manifest object & save
manifest = CollectionManifest(self.manifest_rows)
manifest_name = f"SOURMASH-MANIFEST.csv"

manifest_fp = StringIO()
manifest.write_to_csv(manifest_fp)
manifest_data = manifest_fp.getvalue().encode("utf-8")

self.storage.save(manifest_name, manifest_data, overwrite=True)
self.storage.flush()
self.storage.close()

def open(self):
self.zf = zipfile.ZipFile(self.location, 'w', zipfile.ZIP_STORED)
from .sbt_storage import ZipStorage
storage = ZipStorage(self.location)
if not storage.subdir:
storage.subdir = 'signatures'

self.storage = storage
self.manifest_rows = [] # CTB: load manifest here for append?

def _exists(self, name):
try:
self.zf.getinfo(name)
self.storage.load(name)
return True
except KeyError:
return False

def add(self, ss):
if not self.zf:
if not self.storage:
raise ValueError("this output is not open")

super().add(ss)

buf = sigmod.save_signatures([ss], compression=1)
md5 = ss.md5sum()
outname = f"signatures/{md5}.sig.gz"

# don't overwrite even if duplicate md5sum.
if self._exists(outname):
i = 0
while 1:
outname = os.path.join(self.location, f"{md5}_{i}.sig.gz")
if not self._exists(outname):
break
i += 1
storage = self.storage
path = f'{storage.subdir}/{md5}.sig.gz'
location = storage.save(path, buf)

json_str = sourmash.save_signatures([ss], compression=1)
self.zf.writestr(outname, json_str)
# update manifest
row = CollectionManifest.make_manifest_row(ss, location,
include_signature=False)
self.manifest_rows.append(row)


class SigFileSaveType(Enum):
Expand Down
5 changes: 5 additions & 0 deletions tests/test_sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,12 @@ def test_save_signatures_to_location_1_zip_dup(runtmp):
print(save_sig)
save_sig.add(ss2)
save_sig.add(ss47)

# here we have to change the names so that the sig content is different;
# otherwise, exact duplicates will not be saved.
ss2.name = 'different name for ss2'
save_sig.add(ss2)
ss47.name = 'different name for ss47'
save_sig.add(ss47)

# can we open as a .zip file?
Expand Down

0 comments on commit 5a185bb

Please sign in to comment.