
Commit

reformatted with black and isort
reece committed Sep 17, 2023
1 parent b48834d commit 7cc7f37
Showing 10 changed files with 56 additions and 158 deletions.
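
The commit message states that the tree was reformatted with black and isort. As a rough illustration only (the exact command and configuration are not shown on this page; the sketch below assumes the black and isort packages with default settings, which may differ from the project's actual line length and isort profile), a minimal Python sketch of applying the same two formatters to a source string:

    import black
    import isort

    # Hypothetical input; any syntactically valid Python source string works here.
    SRC = 'import sys,os\nprint( "x" ,  1+2 )\n'


    def reformat(source: str) -> str:
        # isort normalizes import ordering, then black applies its layout rules.
        sorted_src = isort.code(source)
        return black.format_str(sorted_src, mode=black.Mode())


    print(reformat(SRC))

In practice one would run the black and isort command-line tools over the source tree; the programmatic form above is only for illustration.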
113 changes: 27 additions & 86 deletions src/biocommons/seqrepo/cli.py
@@ -44,9 +44,7 @@
 instance_name_new_re = re.compile(
     r"^20[12]\d-\d\d-\d\d$"
 )  # smells like a new datestamp, 2017-01-17
-instance_name_old_re = re.compile(
-    r"^20[12]1\d\d\d\d\d$"
-)  # smells like an old datestamp, 20170117
+instance_name_old_re = re.compile(r"^20[12]1\d\d\d\d\d$")  # smells like an old datestamp, 20170117
 instance_name_re = re.compile(
     r"^20[12]\d-?\d\d-?\d\d$"
 )  # smells like a datestamp, 20170117 or 2017-01-17
@@ -55,9 +53,7 @@


 def _get_remote_instances(opts):
-    line_re = re.compile(
-        r"d[-rwx]{9}\s+[\d,]+ \d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2} (.+)"
-    )
+    line_re = re.compile(r"d[-rwx]{9}\s+[\d,]+ \d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2} (.+)")
     rsync_cmd = [
         opts.rsync_exe,
         "--no-motd",
@@ -93,18 +89,14 @@ def parse_arguments():
         + ". See https://github.com/biocommons/biocommons.seqrepo for more information",
     )
     top_p.add_argument("--dry-run", "-n", default=False, action="store_true")
-    top_p.add_argument(
-        "--remote-host", default="dl.biocommons.org", help="rsync server host"
-    )
+    top_p.add_argument("--remote-host", default="dl.biocommons.org", help="rsync server host")
     top_p.add_argument(
         "--root-directory",
         "-r",
         default=SEQREPO_ROOT_DIR,
         help="seqrepo root directory (SEQREPO_ROOT_DIR)",
     )
-    top_p.add_argument(
-        "--rsync-exe", default="/usr/bin/rsync", help="path to rsync executable"
-    )
+    top_p.add_argument("--rsync-exe", default="/usr/bin/rsync", help="path to rsync executable")
     top_p.add_argument(
         "--verbose",
         "-v",
@@ -152,9 +144,7 @@ def parse_arguments():
     ap = subparsers.add_parser("export", help="export sequences")
     ap.set_defaults(func=export)
     ap.add_argument("ALIASES", nargs="*", help="specific aliases to export")
-    ap.add_argument(
-        "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name"
-    )
+    ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name")
     ap.add_argument(
         "--namespace",
         "-n",
@@ -164,9 +154,7 @@
     # export aliases
     ap = subparsers.add_parser("export-aliases", help="export aliases")
     ap.set_defaults(func=export_aliases)
-    ap.add_argument(
-        "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name"
-    )
+    ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name")
     ap.add_argument(
         "--namespace",
         "-n",
@@ -208,15 +196,11 @@ def parse_arguments():
     )

     # list-local-instances
-    ap = subparsers.add_parser(
-        "list-local-instances", help="list local seqrepo instances"
-    )
+    ap = subparsers.add_parser("list-local-instances", help="list local seqrepo instances")
     ap.set_defaults(func=list_local_instances)

     # list-remote-instances
-    ap = subparsers.add_parser(
-        "list-remote-instances", help="list remote seqrepo instances"
-    )
+    ap = subparsers.add_parser("list-remote-instances", help="list remote seqrepo instances")
     ap.set_defaults(func=list_remote_instances)

     # load
@@ -241,9 +225,7 @@
     )

     # pull
-    ap = subparsers.add_parser(
-        "pull", help="pull incremental update from seqrepo mirror"
-    )
+    ap = subparsers.add_parser("pull", help="pull incremental update from seqrepo mirror")
     ap.set_defaults(func=pull)
     ap.add_argument("--instance-name", "-i", default=None, help="instance name")
     ap.add_argument(
@@ -257,14 +239,10 @@
     # show-status
     ap = subparsers.add_parser("show-status", help="show seqrepo status")
     ap.set_defaults(func=show_status)
-    ap.add_argument(
-        "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name"
-    )
+    ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name")

     # snapshot
-    ap = subparsers.add_parser(
-        "snapshot", help="create a new read-only seqrepo snapshot"
-    )
+    ap = subparsers.add_parser("snapshot", help="create a new read-only seqrepo snapshot")
     ap.set_defaults(func=snapshot)
     ap.add_argument(
         "--instance-name",
@@ -284,9 +262,7 @@
         "start-shell", help="start interactive shell with initialized seqrepo"
     )
     ap.set_defaults(func=start_shell)
-    ap.add_argument(
-        "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name"
-    )
+    ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name")

     # upgrade
     ap = subparsers.add_parser("upgrade", help="upgrade seqrepo database and directory")
@@ -299,9 +275,7 @@
     )

     # update digests
-    ap = subparsers.add_parser(
-        "update-digests", help="update computed digests in place"
-    )
+    ap = subparsers.add_parser("update-digests", help="update computed digests in place")
     ap.set_defaults(func=update_digests)
     ap.add_argument(
         "--instance-name",
@@ -369,9 +343,7 @@ def add_assembly_names(opts):
         sequences = assemblies[assy_name]["sequences"]
         eq_sequences = [s for s in sequences if s["relationship"] in ("=", "<>")]
         if not eq_sequences:
-            _logger.info(
-                "No '=' sequences to load for {an}; skipping".format(an=assy_name)
-            )
+            _logger.info("No '=' sequences to load for {an}; skipping".format(an=assy_name))
             continue

         # all assembled-molecules (1..22, X, Y, MT) have ncbi aliases in seqrepo
@@ -389,23 +361,17 @@
                 )
             )
             if not opts.partial_load:
-                _logger.warning(
-                    "Skipping {an} (-p to enable partial loading)".format(an=assy_name)
-                )
+                _logger.warning("Skipping {an} (-p to enable partial loading)".format(an=assy_name))
                 continue

         eq_sequences = [es for es in eq_sequences if es["refseq_ac"] in ncbi_alias_map]
         _logger.info(
-            "Loading {n} new accessions for assembly {an}".format(
-                an=assy_name, n=len(eq_sequences)
-            )
+            "Loading {n} new accessions for assembly {an}".format(an=assy_name, n=len(eq_sequences))
         )

         for s in eq_sequences:
             seq_id = ncbi_alias_map[s["refseq_ac"]]
-            aliases = [
-                {"namespace": assy_name, "alias": a} for a in [s["name"]] + s["aliases"]
-            ]
+            aliases = [{"namespace": assy_name, "alias": a} for a in [s["name"]] + s["aliases"]]
             for alias in aliases:
                 sr.aliases.store_alias(seq_id=seq_id, **alias)
                 _logger.debug(
@@ -463,9 +429,7 @@ def _rec_iterator():
     for srec, arecs in _rec_iterator():
         nsad = _convert_alias_records_to_ns_dict(arecs)
         aliases = [
-            "{ns}:{a}".format(ns=ns, a=a)
-            for ns, aliases in sorted(nsad.items())
-            for a in aliases
+            "{ns}:{a}".format(ns=ns, a=a) for ns, aliases in sorted(nsad.items()) for a in aliases
         ]
         print(">" + " ".join(aliases))
         for l in _wrap_lines(srec["seq"], 100):
@@ -476,9 +440,7 @@ def export_aliases(opts):
     seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name)
     sr = SeqRepo(seqrepo_dir)
     alias_iterator = sr.aliases.find_aliases(translate_ncbi_namespace=True)
-    grouped_alias_iterator = itertools.groupby(
-        alias_iterator, key=lambda arec: (arec["seq_id"])
-    )
+    grouped_alias_iterator = itertools.groupby(alias_iterator, key=lambda arec: (arec["seq_id"]))
     for _, arecs in grouped_alias_iterator:
         if opts.namespace:
             if not any(arec for arec in arecs if arec["namespace"] == opts.namespace):
@@ -514,9 +476,7 @@ def fetch_load(opts):
 def init(opts):
     seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name)
     if os.path.exists(seqrepo_dir) and len(os.listdir(seqrepo_dir)) > 0:
-        raise IOError(
-            "{seqrepo_dir} exists and is not empty".format(seqrepo_dir=seqrepo_dir)
-        )
+        raise IOError("{seqrepo_dir} exists and is not empty".format(seqrepo_dir=seqrepo_dir))
     sr = SeqRepo(seqrepo_dir, writeable=True)  # flake8: noqa


@@ -559,9 +519,7 @@ def load(opts):
         else:
             fh = io.open(fn, mode="rt", encoding="ascii")
             _logger.info("Opened " + fn)
-        seq_bar = tqdm.tqdm(
-            FastaIter(fh), unit=" seqs", disable=disable_bar, leave=False
-        )
+        seq_bar = tqdm.tqdm(FastaIter(fh), unit=" seqs", disable=disable_bar, leave=False)
         for defline, seq in seq_bar:
             n_seqs_seen += 1
             seq_bar.set_description(
@@ -582,9 +540,7 @@ def pull(opts):
     if opts.instance_name:
         instance_name = opts.instance_name
         if instance_name not in remote_instances:
-            raise KeyError(
-                "{}: not in list of remote instance names".format(instance_name)
-            )
+            raise KeyError("{}: not in list of remote instance names".format(instance_name))
     else:
         instance_name = remote_instances[-1]
         _logger.info("most recent seqrepo instance is " + instance_name)
@@ -600,11 +556,7 @@
     cmd = [opts.rsync_exe, "-aHP", "--no-motd"]
     if local_instances:
         latest_local_instance = local_instances[-1]
-        cmd += [
-            "--link-dest="
-            + os.path.join(opts.root_directory, latest_local_instance)
-            + "/"
-        ]
+        cmd += ["--link-dest=" + os.path.join(opts.root_directory, latest_local_instance) + "/"]
     cmd += ["{h}::seqrepo/{i}/".format(h=opts.remote_host, i=instance_name), tmp_dir]

     _logger.debug("Executing: " + " ".join(cmd))
@@ -627,11 +579,7 @@ def show_status(opts):

     sr = SeqRepo(seqrepo_dir)
     print("seqrepo {version}".format(version=__version__))
-    print(
-        "instance directory: {sr._root_dir}, {ts:.1f} GB".format(
-            sr=sr, ts=tot_size / 1e9
-        )
-    )
+    print("instance directory: {sr._root_dir}, {ts:.1f} GB".format(sr=sr, ts=tot_size / 1e9))
     print(
         "backends: fastadir (schema {fd_v}), seqaliasdb (schema {sa_v}) ".format(
             fd_v=sr.sequences.schema_version(), sa_v=sr.aliases.schema_version()
@@ -667,8 +615,7 @@ def snapshot(opts):

     if os.path.commonpath([src_dir, dst_dir]).startswith(src_dir):
         raise RuntimeError(
-            "Cannot nest seqrepo directories "
-            "({} is within {})".format(dst_dir, src_dir)
+            "Cannot nest seqrepo directories " "({} is within {})".format(dst_dir, src_dir)
         )

     if os.path.exists(dst_dir):
@@ -763,9 +710,7 @@ def update_latest(opts, mri=None):
     if not mri:
         instances = _get_local_instances(opts)
         if not instances:
-            _logger.error(
-                "No seqrepo instances in {opts.root_directory}".format(opts=opts)
-            )
+            _logger.error("No seqrepo instances in {opts.root_directory}".format(opts=opts))
             return
         mri = instances[-1]
     dst = os.path.join(opts.root_directory, "latest")
@@ -781,11 +726,7 @@ def main():
     opts = parse_arguments()

     verbose_log_level = (
-        logging.WARN
-        if opts.verbose == 0
-        else logging.INFO
-        if opts.verbose == 1
-        else logging.DEBUG
+        logging.WARN if opts.verbose == 0 else logging.INFO if opts.verbose == 1 else logging.DEBUG
     )
     logging.basicConfig(level=verbose_log_level)
     opts.func(opts)
4 changes: 1 addition & 3 deletions src/biocommons/seqrepo/dataproxy.py
@@ -214,9 +214,7 @@ def create_dataproxy(uri: str = None) -> _DataProxy:
     scheme = parsed_uri.scheme

     if "+" not in scheme:
-        raise ValueError(
-            "create_dataproxy scheme must include provider (e.g., `seqrepo+http:...`)"
-        )
+        raise ValueError("create_dataproxy scheme must include provider (e.g., `seqrepo+http:...`)")

     provider, proto = scheme.split("+")

20 changes: 5 additions & 15 deletions src/biocommons/seqrepo/fastadir/fabgz.py
@@ -36,26 +36,20 @@ def _get_bgzip_version(exe):
     )
     output = p.communicate()
     version_line = output[0].splitlines()[1]
-    version = re.match(
-        r"(?:Version:|bgzip \(htslib\))\s+(\d+\.\d+(\.\d+)?)", version_line
-    ).group(1)
+    version = re.match(r"(?:Version:|bgzip \(htslib\))\s+(\d+\.\d+(\.\d+)?)", version_line).group(1)
     return version


 def _find_bgzip():
     """return path to bgzip if found and meets version requirements, else exception"""
     missing_file_exception = OSError if six.PY2 else FileNotFoundError
     min_bgzip_version = ".".join(map(str, min_bgzip_version_info))
-    exe = os.environ.get(
-        "SEQREPO_BGZIP_PATH", shutil.which("bgzip") or "/usr/bin/bgzip"
-    )
+    exe = os.environ.get("SEQREPO_BGZIP_PATH", shutil.which("bgzip") or "/usr/bin/bgzip")

     try:
         bgzip_version = _get_bgzip_version(exe)
     except AttributeError:
-        raise RuntimeError(
-            "Didn't find version string in bgzip executable ({exe})".format(exe=exe)
-        )
+        raise RuntimeError("Didn't find version string in bgzip executable ({exe})".format(exe=exe))
     except missing_file_exception:
         raise RuntimeError(
             "{exe} doesn't exist; you need to install htslib and tabix (See https://github.com/biocommons/biocommons.seqrepo#requirements)".format(
@@ -151,15 +145,11 @@ def close(self):
         os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
         os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

-        _logger.info(
-            "{} written; added {} sequences".format(self.filename, len(self._added))
-        )
+        _logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))

     def __del__(self):
         if self._fh is not None:
             _logger.error(
-                "FabgzWriter({}) was not explicitly closed; data may be lost".format(
-                    self.filename
-                )
+                "FabgzWriter({}) was not explicitly closed; data may be lost".format(self.filename)
             )
             self.close()
12 changes: 3 additions & 9 deletions src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py
@@ -55,9 +55,7 @@ def __init__(
         if schema_version != expected_schema_version:  # pragma: no cover
             raise RuntimeError(
                 "Upgrade required: Database schema"
-                "version is {} and code expects {}".format(
-                    schema_version, expected_schema_version
-                )
+                "version is {} and code expects {}".format(schema_version, expected_schema_version)
             )

     # ############################################################################
@@ -88,9 +86,7 @@ def fetch_aliases(self, seq_id, current_only=True, translate_ncbi_namespace=None
             _logger.warning(
                 "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed"
             )
-        return [
-            dict(r) for r in self.find_aliases(seq_id=seq_id, current_only=current_only)
-        ]
+        return [dict(r) for r in self.find_aliases(seq_id=seq_id, current_only=current_only)]

     def find_aliases(
         self,
@@ -207,9 +203,7 @@ def store_alias(self, seq_id, namespace, alias):
             return current_rec["seqalias_id"]

         # otherwise, we're reassigning; deprecate old record, then retry
-        _logger.debug(
-            log_pfx + ": collision; deprecating {s1}".format(s1=current_rec["seq_id"])
-        )
+        _logger.debug(log_pfx + ": collision; deprecating {s1}".format(s1=current_rec["seq_id"]))
         cursor.execute(
             "update seqalias set is_current = 0 where seqalias_id = ?",
             [current_rec["seqalias_id"]],