Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…o add/zipfile_use_storage
  • Loading branch information
ctb committed Jun 24, 2021
2 parents c039fd6 + 9dbd8b5 commit dec537a
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 30 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,18 @@ Tests require py.test and can be run with `make test`.
Please see [the developer notes](doc/developer.md) for more information
on getting set up with a development environment.

## Research notice

Please note that this repository is participating in a study into the
sustainability of open source projects. Data will be gathered about this
repository for approximately 12 months, starting from 2021-06-11.

The data collected will include the number of contributors, the number of PRs,
the time taken to close/merge these PRs, and the number of issues closed.

For more information, please visit
[our informational page](https://sustainable-open-science-and-software.github.io/) or download our [participant information sheet](https://sustainable-open-science-and-software.github.io/assets/PIS_sustainable_software.pdf).

----

CTB
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def add_moltype_args(parser):
parser.set_defaults(hp=False)

parser.add_argument(
'--dna', '--rna', dest='dna', default=None, action='store_true',
'--dna', '--rna', '--nucleotide', dest='dna', default=None, action='store_true',
help='choose a nucleotide signature (default: True)')
parser.add_argument(
'--no-dna', '--no-rna', dest='dna', action='store_false',
'--no-dna', '--no-rna', '--no-nucleotide', dest='dna', action='store_false',
help='do not choose a nucleotide signature')
parser.set_defaults(dna=None)

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
if kind == "Zip":
manifest_name = os.path.join(storage.subdir, manifest_name)
manifest_path = storage.save(manifest_name, manifest_data,
overwrite=True)
overwrite=True, compress=True)
elif kind == "FS":
manifest_name = manifest_name
manifest_path = storage.save(manifest_name, manifest_data,
Expand Down
29 changes: 23 additions & 6 deletions src/sourmash/sbt_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,22 @@ def _generate_filename(self, zf, path, content):

assert 0 # should never get here!

def save(self, path, content, *, overwrite=False):
def _write_to_zf(self, zf, path, content, *, compress=False):
compress_type = zipfile.ZIP_STORED
if compress:
compress_type = zipfile.ZIP_DEFLATED

# save to zipfile
zf.writestr(path, content, compress_type=compress_type)

# set permissions
zi = zf.getinfo(path)
perms = 0o444 << 16 # give a+r access
if path.endswith('/'):
perms = 0o755 << 16 # directories get u+rwx, a+rx
zi.external_attr = perms

def save(self, path, content, *, overwrite=False, compress=False):
# First try to save to self.zipfile, if it is not writable
# or would introduce duplicates then try to save it in the buffer
if overwrite:
Expand All @@ -161,13 +176,15 @@ def save(self, path, content, *, overwrite=False):
newpath, do_write = self._generate_filename(self.zipfile, path, content)
if do_write:
try:
self.zipfile.writestr(newpath, content)
self._write_to_zf(self.zipfile, newpath, content,
compress=compress)
except (ValueError, RuntimeError):
# Can't write in the zipfile, write in buffer instead
# CTB: do we need to generate a new filename with respect to the
# bufferzip, too? Not sure this code is working as intended...
if self.bufferzip:
self.bufferzip.writestr(newpath, content)
self._write_to_zf(self.bufferzip, newpath, content,
compress=compress)
else:
# Throw error, can't write the data
raise ValueError("can't write data")
Expand Down Expand Up @@ -235,10 +252,10 @@ def flush(self, *, keep_closed=False):
if item in duplicated or item in buffer_names:
# we prioritize writing data from the buffer to the
# final file
final_file.writestr(item, self.bufferzip.read(item))
self._write_to_zf(final_file, item, self.bufferzip.read(item))
else:
# it is only in the zipfile, so write from it
final_file.writestr(item, self.zipfile.read(item))
self._write_to_zf(final_file, item, self.zipfile.read(item))

# close the files, remove the old one and copy the final
# file to the right place.
Expand All @@ -257,7 +274,7 @@ def flush(self, *, keep_closed=False):
zf = zipfile.ZipFile(self.path, mode='a',
compression=zipfile.ZIP_STORED)
for item in new_data:
zf.writestr(item, self.bufferzip.read(item))
self._write_to_zf(zf, item, self.bufferzip.read(item))
self.zipfile = zf
# finally, close the buffer and release memory
self.bufferzip.close()
Expand Down
52 changes: 32 additions & 20 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,8 +723,8 @@ class SaveSignatures_ZipFile(_BaseSaveSignaturesToLocation):
"Save compressed signatures in an uncompressed Zip file."
def __init__(self, location):
super().__init__(location)
self.storage = None
self.zf = None

def __repr__(self):
return f"SaveSignatures_ZipFile('{self.location}')"

Expand All @@ -737,41 +737,53 @@ def close(self):
manifest.write_to_csv(manifest_fp, write_header=True)
manifest_data = manifest_fp.getvalue().encode("utf-8")

self.storage.save(manifest_name, manifest_data, overwrite=True)
self.storage.flush()
self.storage.close()
# compress the manifest --
self.zf.writestr(manifest_name, manifest_data,
compress_type=zipfile.ZIP_DEFLATED)

def open(self):
from .sbt_storage import ZipStorage
storage = ZipStorage(self.location)
if not storage.subdir:
storage.subdir = 'signatures'
# set permissions:
zi = self.zf.getinfo(manifest_name)
zi.external_attr = 0o444 << 16 # give a+r access

self.zf.close()

self.storage = storage
self.manifest_rows = [] # CTB: load manifest here for append?
def open(self):
self.zf = zipfile.ZipFile(self.location, 'w', zipfile.ZIP_STORED)
self.manifest_rows = []

def _exists(self, name):
try:
self.storage.load(name)
self.zf.getinfo(name)
return True
except KeyError:
return False

def add(self, ss):
if not self.storage:
if not self.zf:
raise ValueError("this output is not open")

super().add(ss)

buf = sigmod.save_signatures([ss], compression=1)
md5 = ss.md5sum()
outname = f"signatures/{md5}.sig.gz"

# don't overwrite even if duplicate md5sum.
if self._exists(outname):
i = 0
while 1:
outname = os.path.join(self.location, f"{md5}_{i}.sig.gz")
if not self._exists(outname):
break
i += 1

json_str = sourmash.save_signatures([ss], compression=1)
self.zf.writestr(outname, json_str)

storage = self.storage
path = f'{storage.subdir}/{md5}.sig.gz'
location = storage.save(path, buf)
# set permissions:
zi = self.zf.getinfo(outname)
zi.external_attr = 0o444 << 16 # give a+r access

# update manifest
row = CollectionManifest.make_manifest_row(ss, location,
row = CollectionManifest.make_manifest_row(ss, outname,
include_signature=False)
self.manifest_rows.append(row)

Expand Down
28 changes: 27 additions & 1 deletion tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,32 @@ def test_do_basic_compare_using_rna_arg(c):
assert (cmp_out == cmp_calc).all()


def test_do_basic_compare_using_nucleotide_arg(runtmp):
    # run a basic 'compare' with --nucleotide (rather than --dna/--rna) and
    # check the saved matrix against similarities computed directly.
    import numpy

    sigfiles = glob.glob(utils.get_test_data('genome-s1*.sig'))

    runtmp.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--nucleotide',
                        *sigfiles)

    # the comparison matrix must have been written out
    matrix_path = runtmp.output('cmp')
    assert os.path.exists(matrix_path)
    observed = numpy.load(matrix_path)

    # load the same signatures as DNA and rebuild the matrix by hand
    loaded = [
        sourmash.load_one_signature(fn, ksize=21, select_moltype='dna')
        for fn in sigfiles
    ]

    size = len(loaded)
    expected = numpy.zeros([size, size])
    for row, sig_a in enumerate(loaded):
        for col, sig_b in enumerate(loaded):
            expected[row][col] = sig_a.similarity(sig_b)

    assert (observed == expected).all()


@utils.in_tempdir
def test_do_compare_quiet(c):
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -2112,7 +2138,7 @@ def test_do_sourmash_index_bad_args():
in_directory=location, fail_ok=True)

print(out, err)
assert 'cannot specify more than one of --dna/--rna/--protein/--hp/--dayhoff' in err
assert 'cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff' in err
assert status != 0


Expand Down

0 comments on commit dec537a

Please sign in to comment.