Skip to content

Commit

Permalink
Closes #31: Implemented store, fetch, and alias record translation fo…
Browse files Browse the repository at this point in the history
…r NCBI↔refseq, Ensembl↔ensembl, LRG↔lrg
  • Loading branch information
reece committed Jul 6, 2020
1 parent a3e8728 commit f2b48ef
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 40 deletions.
40 changes: 40 additions & 0 deletions src/biocommons/seqrepo/_internal/translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""translate namespaces
Translates between the namespaces exposed in the API and those stored
in the DB. This is a temporary measure intended to create a smooth
transition for clients. See biocommons.seqrepo#31 for background.

Use cases:

* Store: translate the API namespace to the DB namespace, e.g.,
  "refseq" -> "NCBI".
* Find: translate the query argument as for store, and additionally
  translate DB namespaces to API namespaces in the returned records.

All translations occur in seqaliasdb.
"""

import copy

# Namespace pairs, one tuple per translated namespace, laid out as
# (DB namespace, API namespace).
translations = [
    ("NCBI", "refseq"),
    ("Ensembl", "ensembl"),
    ("LRG", "lrg"),
]


# Lookup table keyed by DB namespace, giving the API namespace.
ns_db2api = dict(translations)
# Lookup table keyed by API namespace, giving the DB namespace.
ns_api2db = {api_ns: db_ns for db_ns, api_ns in translations}

def translate_alias_records(aliases_itr):
    """Stream alias records, adding API-namespace copies where applicable.

    Every record taken from ``aliases_itr`` is yielded unchanged. When
    the record's "namespace" value is a key of ``ns_db2api``, a shallow
    copy of the record with "namespace" replaced by the mapped API
    namespace is yielded immediately after the original.
    """

    for record in aliases_itr:
        # The record as received always comes first.
        yield record

        db_ns = record["namespace"]
        if db_ns not in ns_db2api:
            continue

        # Copy before editing so the record already yielded is not mutated.
        translated = copy.copy(record)
        translated["namespace"] = ns_db2api[db_ns]
        yield translated
40 changes: 22 additions & 18 deletions src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
import pkg_resources
import yoyo

from .._internal.translate import translate_alias_records, ns_api2db
from .._internal.logging_support import DuplicateFilter

_logger = logging.getLogger(__name__)
_logger.addFilter(DuplicateFilter())
#_logger.addFilter(DuplicateFilter())


expected_schema_version = 1
Expand All @@ -26,11 +27,13 @@ class SeqAliasDB(object):
"""

def __init__(self, db_path, writeable=False, translate_ncbi_namespace=False, check_same_thread=True):
def __init__(self, db_path, writeable=False, translate_ncbi_namespace=None, check_same_thread=True):
self._db_path = db_path
self._db = None
self._writeable = writeable
self.translate_ncbi_namespace = translate_ncbi_namespace

if translate_ncbi_namespace is not None:
_logger.warning("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed")

if self._writeable:
self._upgrade_db()
Expand Down Expand Up @@ -64,12 +67,13 @@ def commit(self):
def fetch_aliases(self, seq_id, current_only=True, translate_ncbi_namespace=None):
"""return list of alias annotation records (dicts) for a given seq_id"""
_logger.warning("SeqAliasDB::fetch_aliases() is deprecated; use find_aliases(seq_id=...) instead")
if translate_ncbi_namespace is not None:
_logger.warning("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed")
return [dict(r) for r in self.find_aliases(seq_id=seq_id,
current_only=current_only,
translate_ncbi_namespace=translate_ncbi_namespace)]
current_only=current_only)]

def find_aliases(self, seq_id=None, namespace=None, alias=None, current_only=True, translate_ncbi_namespace=None):
"""returns iterator over alias annotation records that match criteria
"""returns iterator over alias annotation dicts that match criteria
The arguments, all optional, restrict the records that are
returned. Without arguments, all aliases are returned.
Expand All @@ -81,21 +85,21 @@ def find_aliases(self, seq_id=None, namespace=None, alias=None, current_only=Tru
used. Otherwise arguments must match exactly.
"""

clauses = []
params = []

def eq_or_like(s):
return "like" if "%" in s else "="

if translate_ncbi_namespace is None:
translate_ncbi_namespace = self.translate_ncbi_namespace
if translate_ncbi_namespace is not None:
_logger.warning("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed")

if alias is not None:
clauses += ["alias {} ?".format(eq_or_like(alias))]
params += [alias]
if namespace is not None:
# Switch to using refseq for RefSeq accessions
# issue #38: translate "refseq" to "NCBI" to enable RefSeq lookups
# issue #31: later breaking change, translate database
# #31: translate "refseq" in API to "NCBI" for db (transitional)
if namespace.lower() == "refseq":
namespace = "NCBI"
clauses += ["namespace {} ?".format(eq_or_like(namespace))]
Expand All @@ -107,19 +111,16 @@ def eq_or_like(s):
clauses += ["is_current = 1"]

cols = ["seqalias_id", "seq_id", "alias", "added", "is_current"]
if translate_ncbi_namespace:
cols += ["case namespace when 'NCBI' then 'refseq' else namespace end as namespace"]
else:
cols += ["namespace"]
cols += ["namespace"]
sql = "select {cols} from seqalias".format(cols=", ".join(cols))
if clauses:
sql += " where " + " and ".join("(" + c + ")" for c in clauses)
sql += " order by seq_id, namespace, alias"

_logger.debug("Executing: " + sql)
_logger.debug("Executing: {} with params {}".format(sql, params))
cursor = self._db.cursor()
cursor.execute(sql, params)
return cursor
return translate_alias_records(dict(r) for r in cursor)

def schema_version(self):
"""return schema version as integer"""
Expand Down Expand Up @@ -147,6 +148,9 @@ def store_alias(self, seq_id, namespace, alias):
if not self._writeable:
raise RuntimeError("Cannot write -- opened read-only")

if namespace in ns_api2db:
namespace = ns_api2db[namespace]

log_pfx = "store({q},{n},{a})".format(n=namespace, a=alias, q=seq_id)
cursor = self._db.cursor()
try:
Expand All @@ -163,7 +167,7 @@ def store_alias(self, seq_id, namespace, alias):
# IntegrityError fall-through

# existing record is guaranteed to exist uniquely; fetchone() should always succeed
current_rec = self.find_aliases(namespace=namespace, alias=alias).fetchone()
current_rec = next(self.find_aliases(namespace=namespace, alias=alias))

# if seq_id matches current record, it's a duplicate (seq_id, namespace, alias) tuple
# and we return current record
Expand Down
20 changes: 11 additions & 9 deletions src/biocommons/seqrepo/seqrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class SeqRepo(object):
"""

def __init__(self, root_dir, writeable=False, upcase=True, translate_ncbi_namespace=False, check_same_thread=False):
def __init__(self, root_dir, writeable=False, upcase=True, translate_ncbi_namespace=None, check_same_thread=False):
self._root_dir = root_dir
self._upcase = upcase
self._db_path = os.path.join(self._root_dir, "aliases.sqlite3")
Expand All @@ -46,7 +46,6 @@ def __init__(self, root_dir, writeable=False, upcase=True, translate_ncbi_namesp
self._pending_sequences_len = 0
self._pending_aliases = 0
self._writeable = writeable
self.translate_ncbi_namespace = translate_ncbi_namespace
self._check_same_thread = True if writeable else check_same_thread

if self._writeable:
Expand All @@ -58,9 +57,11 @@ def __init__(self, root_dir, writeable=False, upcase=True, translate_ncbi_namesp
self.sequences = FastaDir(self._seq_path, writeable=self._writeable, check_same_thread=self._check_same_thread)
self.aliases = SeqAliasDB(self._db_path,
writeable=self._writeable,
translate_ncbi_namespace=self.translate_ncbi_namespace,
check_same_thread=self._check_same_thread)

if translate_ncbi_namespace is not None:
_logger.warn("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed")

def __contains__(self, nsa):
ns, a = nsa.split(nsa_sep) if nsa_sep in nsa else (None, nsa)
return any(self.aliases.find_aliases(alias=a, namespace=ns))
Expand Down Expand Up @@ -177,11 +178,10 @@ def translate_alias(self, alias, namespace=None, target_namespaces=None, transla
"""

if translate_ncbi_namespace is None:
translate_ncbi_namespace = self.translate_ncbi_namespace
if translate_ncbi_namespace is not None:
_logger.warn("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed")
seq_id = self._get_unique_seqid(alias=alias, namespace=namespace)
aliases = self.aliases.fetch_aliases(seq_id=seq_id,
translate_ncbi_namespace=translate_ncbi_namespace)
aliases = self.aliases.fetch_aliases(seq_id=seq_id)
if target_namespaces:
aliases = [a for a in aliases if a["namespace"] in target_namespaces]
return aliases
Expand All @@ -192,11 +192,13 @@ def translate_identifier(self, identifier, target_namespaces=None, translate_ncb
identifiers) that refer to the same sequence.
"""
if translate_ncbi_namespace is not None:
_logger.warn("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed")

namespace, alias = identifier.split(nsa_sep) if nsa_sep in identifier else (None, identifier)
aliases = self.translate_alias(alias=alias,
namespace=namespace,
target_namespaces=target_namespaces,
translate_ncbi_namespace=translate_ncbi_namespace)
target_namespaces=target_namespaces)
return [nsa_sep.join((a["namespace"], a["alias"])) for a in aliases]


Expand Down
36 changes: 23 additions & 13 deletions tests/test_seqrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,23 +79,33 @@ def test_refseq_lookup(seqrepo):
assert seqrepo["refseq:ncbiac"] == "NCBISEQUENCE"


def test_refseq_translation(tmpdir_factory):
def test_namespace_translation(tmpdir_factory):
dir = str(tmpdir_factory.mktemp('seqrepo'))

seqrepo = SeqRepo(dir, writeable=True)
seqrepo.store("NCBISEQUENCE", [{"namespace": "NCBI", "alias": "ncbiac"}])
seqrepo.commit()
del seqrepo

seqrepo = SeqRepo(dir, writeable=False, translate_ncbi_namespace=False)
aliases = list(seqrepo.aliases.find_aliases(alias="ncbiac"))
assert len(aliases) == 1
assert aliases[0]["namespace"] == "NCBI"
# store sequences
seqrepo.store("NCBISEQUENCE", [{"namespace": "NCBI", "alias": "ncbiac" }])
seqrepo.store("ENSEMBLSEQUENCE", [{"namespace": "Ensembl", "alias": "ensemblac"}])
seqrepo.store("LRGSEQUENCE", [{"namespace": "LRG", "alias": "lrgac" }])
seqrepo.store("REFSEQSEQUENCE", [{"namespace": "refseq", "alias": "refseqac" }]) # should be stored as NCBI:refseqac
seqrepo.commit()

seqrepo = SeqRepo(dir, writeable=False, translate_ncbi_namespace=True)
aliases = list(seqrepo.aliases.find_aliases(alias="ncbiac"))
assert len(aliases) == 1
assert aliases[0]["namespace"] == "refseq"
# lookups, no query translation
assert seqrepo["NCBI:ncbiac"] == "NCBISEQUENCE"
assert seqrepo["Ensembl:ensemblac"] == "ENSEMBLSEQUENCE"
assert seqrepo["LRG:lrgac"] == "LRGSEQUENCE"
assert seqrepo["NCBI:refseqac"] == "REFSEQSEQUENCE" # tests ns translation on store

# lookups, w/ query translation
assert seqrepo["refseq:ncbiac"] == "NCBISEQUENCE"
assert seqrepo["RefSeq:ncbiac"] == "NCBISEQUENCE" # case-squashed
assert seqrepo["Ensembl:ensemblac"] == "ENSEMBLSEQUENCE"
assert seqrepo["LRG:lrgac"] == "LRGSEQUENCE"

seq_id = seqrepo._get_unique_seqid(alias="ncbiac", namespace="NCBI")
aliases = list(seqrepo.aliases.find_aliases(seq_id=seq_id))
assert any(a for a in aliases if a["namespace"] == "refseq")



def test_translation(seqrepo):
Expand Down

0 comments on commit f2b48ef

Please sign in to comment.