Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

databases and utilities #77

Merged
merged 13 commits into from
Jun 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions .condarc.yaml

This file was deleted.

3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,14 @@ dmypy.json
autometa/*.pyc
autometa/taxonomy/*.pyc
autometa/databases/markers/*.h3*
autometa/databases/ncbi/*

# databases / testing
tests/data/*
!tests/data/metagenome.fna

# visualStudioCode
.vscode/*
.vscode/
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
Expand Down
171 changes: 130 additions & 41 deletions autometa/common/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@
import logging
import os
import pickle
import sys
import tarfile
import time

import numpy as np

from functools import wraps


Expand Down Expand Up @@ -101,15 +104,19 @@ def make_pickle(obj, outfpath):
return outfpath


def gunzip(infpath, outfpath):
"""Decompress gzipped `infpath` to `outfpath`.
def gunzip(infpath, outfpath, delete_original=False, block_size=65536):
"""Decompress gzipped `infpath` to `outfpath` and write checksum of `outfpath` upon successful decompression.

Parameters
----------
infpath : str
</path/to/file.gz>
outfpath : str
</path/to/file>
delete_original : bool
Will delete the original file after successfully decompressing `infpath` (Default is False).
block_size : int
Amount of `infpath` to read in to memory before writing to `outfpath` (Default is 65536 bytes).

Returns
-------
Expand All @@ -125,15 +132,21 @@ def gunzip(infpath, outfpath):
logger.debug(
f"gunzipping {os.path.basename(infpath)} to {os.path.basename(outfpath)}"
)
if os.path.exists(outfpath) and os.stat(outfpath).st_size > 0:
if os.path.exists(outfpath) and os.path.getsize(outfpath) > 0:
raise FileExistsError(outfpath)
lines = ""
with gzip.open(infpath) as fh:
for line in fh:
lines += line.decode()
with open(outfpath, "w") as out:
with gzip.open(infpath, "rt") as fh, open(outfpath, "w") as out:
for i, line in enumerate(fh):
lines += line
if sys.getsizeof(lines) >= block_size:
out.write(lines)
lines = ""
out.write(lines)
logger.debug(f"gunzipped {infpath} to {outfpath}")
write_checksum(outfpath, f"{outfpath}.md5")
if delete_original:
os.remove(infpath)
logger.debug(f"removed original file: {infpath}")
return outfpath
jason-c-kwan marked this conversation as resolved.
Show resolved Hide resolved


Expand All @@ -149,7 +162,7 @@ def untar(tarchive, outdir, member=None):
</path/tarchive.tar.[compression]>
outdir : str
</path/to/output/directory>
member : str
member : str, optional
member file to extract.

Returns
Expand All @@ -165,14 +178,15 @@ def untar(tarchive, outdir, member=None):
`tarchive` is not a tar archive
KeyError
`member` was not found in `tarchive`

"""
if not member and not outdir:
raise ValueError(
f"`member` or `outdir` must be passed: member={member} outdir={outdir}"
)
logger.debug(f"decompressing tarchive {tarchive} to {outdir}")
outfpath = os.path.join(outdir, member) if member else None
if member and os.path.exists(outfpath) and os.stat(outfpath).st_size > 0:
if member and os.path.exists(outfpath) and os.path.getsize(outfpath) > 0:
raise FileExistsError(outfpath)
if not tarfile.is_tarfile(tarchive):
raise ValueError(f"{tarchive} is not a tar archive")
Expand Down Expand Up @@ -219,6 +233,7 @@ def tarchive_results(outfpath, src_dirpath):
-------
FileExistsError
`outfpath` already exists

"""
logger.debug(f"tar archiving {src_dirpath} to {outfpath}")
if os.path.exists(outfpath):
Expand All @@ -229,16 +244,17 @@ def tarchive_results(outfpath, src_dirpath):
return outfpath


def file_length(fpath):
def file_length(fpath, approximate=False):
"""Retrieve the number of lines in `fpath`

See:
https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
See: https://stackoverflow.com/q/845058/13118765

Parameters
----------
fpath : str
Description of parameter `fpath`.
approximate: bool
If True, will approximate the length of the file from the file size.

Returns
-------
Expand All @@ -253,18 +269,28 @@ def file_length(fpath):
"""
if not os.path.exists(fpath):
raise FileNotFoundError(fpath)
if fpath.endswith(".gz"):
fh = gzip.open(fpath, "rb")
else:
fh = open(fpath, "rb")

fh = gzip.open(fpath, "rt") if fpath.endswith(".gz") else open(fpath, "rb")
if approximate:
lines = []
n_sample_lines = 100000
for i, l in enumerate(fh):
if i > n_sample_lines:
break
lines.append(sys.getsizeof(l))
fh.close()
avg_size_per_line = np.average(lines)
total_size = os.path.getsize(fpath)
return int(np.ceil(total_size / avg_size_per_line))

for i, l in enumerate(fh):
pass
fh.close()
return i + 1


def get_checksum(fpath):
"""Retrieve sha256 checksums from provided `args`.
def calc_checksum(fpath):
"""Retrieve md5 checksum from provided `fpath`.

See:
https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
Expand All @@ -277,18 +303,20 @@ def get_checksum(fpath):
Returns
-------
str
hexdigest of `fpath` using sha256
space-delimited hexdigest of `fpath` using md5sum and basename of `fpath`.
e.g. 'hash filename\n'

Raises
-------
FileNotFoundError
Provided `fpath` does not exist
TypeError
`fpath` is not a string

"""

def sha(block):
hasher = hashlib.sha256()
def md5sum(block):
hasher = hashlib.md5()
for bytes in block:
hasher.update(bytes)
return hasher.hexdigest()
Expand All @@ -300,14 +328,78 @@ def blockiter(fh, blocksize=65536):
yield block
block = fh.read(blocksize)

if type(fpath) != str:
if not isinstance(fpath, str):
raise TypeError(type(fpath))
if not os.path.exists(fpath):
raise FileNotFoundError(fpath)
fh = open(fpath, "rb")
cksum = sha(blockiter(fh))
hash = md5sum(blockiter(fh))
fh.close()
return cksum
return f"{hash} {os.path.basename(fpath)}\n"


def read_checksum(fpath):
    """Retrieve the checksum line from a checksum-formatted file `fpath`.

    Note: See `write_checksum` for how a checksum file is generated.

    Parameters
    ----------
    fpath : str
        </path/to/file.md5>

    Returns
    -------
    str
        First line of `fpath`, i.e. the previously written checksum.

    Raises
    -------
    TypeError
        Provided `fpath` was not a string.
    FileNotFoundError
        Provided `fpath` does not exist.

    """
    # Guard clauses: type check first so a non-str path always raises
    # TypeError, then existence check.
    if not isinstance(fpath, str):
        raise TypeError(type(fpath))
    if not os.path.exists(fpath):
        raise FileNotFoundError(fpath)
    with open(fpath) as handle:
        first_line = handle.readline()
    return first_line
jason-c-kwan marked this conversation as resolved.
Show resolved Hide resolved


def write_checksum(infpath, outfpath):
    """Calculate the md5 checksum of `infpath` and write it to `outfpath`.

    Note: The checksum line is produced by `calc_checksum` and is written
    in md5sum-compatible format: the hexdigest, a space, then the basename
    of `infpath`, followed by a newline.

    Parameters
    ----------
    infpath : str
        </path/to/input/file>
    outfpath : str
        </path/to/output/checksum/file>

    Returns
    -------
    NoneType
        The checksum is written to `outfpath`; nothing is returned.

    Raises
    -------
    FileNotFoundError
        Provided `infpath` does not exist
    TypeError
        `infpath` or `outfpath` is not a string

    """
    # Validate types before touching the filesystem so a non-str `infpath`
    # raises the documented TypeError instead of being misinterpreted by
    # os.path.exists (e.g. an int argument is treated as a file descriptor).
    for fpath in (infpath, outfpath):
        if not isinstance(fpath, str):
            raise TypeError(type(fpath))
    if not os.path.exists(infpath):
        raise FileNotFoundError(infpath)
    checksum = calc_checksum(infpath)
    with open(outfpath, "w") as fh:
        fh.write(checksum)
    logger.debug(f"Wrote {infpath} checksum to {outfpath}")


def valid_checkpoint(checkpoint_fp, fpath):
Expand All @@ -331,9 +423,10 @@ def valid_checkpoint(checkpoint_fp, fpath):
Either `fpath` or `checkpoint_fp` does not exist
TypeError
Either `fpath` or `checkpoint_fp` is not a string

"""
for fp in [checkpoint_fp, fpath]:
if not type(fp) is str:
if not isinstance(fp, str):
raise TypeError(f"{fp} is type: {type(fp)}")
if not os.path.exists(fp):
raise FileNotFoundError(fp)
Expand All @@ -345,8 +438,8 @@ def valid_checkpoint(checkpoint_fp, fpath):
# If filepaths never match, prev_chksum and new_chksum will not match.
# Giving expected result.
break
new_chksum = get_checksum(fpath)
return True if new_chksum == prev_chksum else False
new_chksum = calc_checksum(fpath)
return new_chksum == prev_chksum


def get_checkpoints(checkpoint_fp, fpaths=None):
Expand All @@ -358,7 +451,7 @@ def get_checkpoints(checkpoint_fp, fpaths=None):
----------
checkpoint_fp : str
</path/to/checkpoints.tsv>
fpaths : [str, ...]
fpaths : [str, ...], optional
[</path/to/file>, ...]

Returns
Expand All @@ -371,6 +464,7 @@ def get_checkpoints(checkpoint_fp, fpaths=None):
ValueError
When `checkpoint_fp` first being written, will not populate an empty checkpoints file.
Raises an error if the `fpaths` list is empty or None

"""
if not os.path.exists(checkpoint_fp):
logger.debug(f"{checkpoint_fp} not found... Writing")
Expand All @@ -381,10 +475,10 @@ def get_checkpoints(checkpoint_fp, fpaths=None):
outlines = ""
for fpath in fpaths:
try:
checksum = get_checksum(fpath)
checksum = calc_checksum(fpath)
except FileNotFoundError as err:
checksum = ""
outlines += f"{checksum}\t{fpath}\n"
outlines += checksum
with open(checkpoint_fp, "w") as fh:
fh.write(outlines)
logger.debug(f"Written: {checkpoint_fp}")
Expand Down Expand Up @@ -412,20 +506,19 @@ def update_checkpoints(checkpoint_fp, fpath):
-------
dict
{fp:checksum, ...}

"""
checkpoints = get_checkpoints(checkpoint_fp)
if valid_checkpoint(checkpoint_fp, fpath):
return checkpoints
new_checksum = get_checksum(fpath)
new_checksum = calc_checksum(fpath)
checkpoints.update({fpath: new_checksum})
outlines = ""
for fp, chk in checkpoints.items():
outlines += f"{chk}\t{fp}\n"
with open(checkpoint_fp, "w") as fh:
fh.write(outlines)
logger.debug(
f"Updated checkpoints with {os.path.basename(fpath)} -> {new_checksum[:16]}"
)
logger.debug(f"Checkpoints updated: {new_checksum[:16]} {os.path.basename(fpath)}")
return checkpoints


Expand Down Expand Up @@ -469,13 +562,9 @@ def wrapper(*args, **kwds):


if __name__ == "__main__":
    # This module is a library of helper utilities; it has no CLI entrypoint.
    # (The diff page interleaved the removed argparse scaffolding with the
    # new message — this is the post-merge version.)
    print(
        "This file contains utilities for Autometa pipeline and should not be run directly!"
    )
    import sys

    sys.exit(0)
Loading