Skip to content

Commit

Permalink
Improve Rhea import (#168)
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored Feb 2, 2024
1 parent 259d161 commit a2da824
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 49 deletions.
8 changes: 5 additions & 3 deletions src/pyobo/sources/famplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,11 @@ def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
}
xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s))
xrefs_df[1] = [
bioregistry.standardize_identifier(xref_prefix, xref_identifier)
if xref_prefix != "nextprot.family"
else xref_identifier[len("FA:") :]
(
bioregistry.standardize_identifier(xref_prefix, xref_identifier)
if xref_prefix != "nextprot.family"
else xref_identifier[len("FA:") :]
)
for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values
]

Expand Down
177 changes: 141 additions & 36 deletions src/pyobo/sources/rhea.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,51 @@
"""Converter for Rhea."""

import logging
from typing import Iterable
from typing import TYPE_CHECKING, Dict, Iterable, Optional

import bioversions
import pystow

from pyobo.struct import Obo, Reference, Term
from pyobo.struct.typedef import (
TypeDef,
enabled_by,
has_bidirectional_reaction,
has_input,
has_left_to_right_reaction,
has_output,
has_participant,
has_right_to_left_reaction,
reaction_enabled_by_molecular_function,
)
from pyobo.utils.path import ensure_df

if TYPE_CHECKING:
import rdflib

__all__ = [
"RheaGetter",
]

logger = logging.getLogger(__name__)
PREFIX = "rhea"
RHEA_RDF_GZ_URL = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"


class RheaGetter(Obo):
"""An ontology representation of Rhea's chemical reaction database."""

ontology = bioversions_key = PREFIX
typedefs = [has_left_to_right_reaction, has_bidirectional_reaction, has_right_to_left_reaction]
typedefs = [
has_left_to_right_reaction,
has_bidirectional_reaction,
has_right_to_left_reaction,
enabled_by,
has_input,
has_output,
has_participant,
reaction_enabled_by_molecular_function,
]

def iter_terms(self, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in the ontology."""
Expand All @@ -39,25 +59,54 @@ def get_obo(force: bool = False) -> Obo:
return RheaGetter(force=force)


def ensure_rhea_rdf(version: Optional[str] = None, force: bool = False) -> "rdflib.Graph":
    """Get the Rhea RDF graph.

    :param version: The Rhea version used as the last storage sub-key. If not
        given, it is looked up with :func:`bioversions.get_version`.
    :param force: Forwarded unchanged to :func:`pystow.ensure_rdf` as ``force``.
    :returns: Whatever :func:`pystow.ensure_rdf` returns for the Rhea RDF dump,
        parsed with ``format="xml"``.
    """
    # see docs: https://ftp.expasy.org/databases/rhea/rdf/rhea_rdf_documentation.pdf
    if version is None:
        version = bioversions.get_version(PREFIX)
    return pystow.ensure_rdf(
        "pyobo",
        "raw",
        PREFIX,
        version,
        url=RHEA_RDF_GZ_URL,
        force=force,
        parse_kwargs={"format": "xml"},
    )


def _get_lr_name(name: str) -> str:
return name.replace(" = ", " => ")


def _get_rl_name(name: str) -> str:
left, right = name.split(" = ", 1)
return f"{right} => {left}"


def _get_bi_name(name: str) -> str:
return name.replace(" = ", " <=> ")


def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in Rhea."""
url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
graph = pystow.ensure_rdf(
"pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
)
graph = ensure_rhea_rdf(version=version, force=force)
result = graph.query(
"""
PREFIX rh:<http://rdf.rhea-db.org/>
SELECT ?reaction ?reactionId ?reactionLabel WHERE {
?reaction rdfs:subClassOf rh:Reaction .
?reaction rh:id ?reactionId .
?reaction rdfs:label ?reactionLabel .
}
"""\
PREFIX rh:<http://rdf.rhea-db.org/>
SELECT ?reaction ?reactionId ?reactionLabel WHERE {
?reaction rdfs:subClassOf rh:Reaction ;
rh:id ?reactionId ;
rdfs:label ?reactionLabel .
}
"""
)
names = {str(identifier): name for _, identifier, name in result}
names = {str(identifier): str(name) for _, identifier, name in result}

terms = {}
terms: Dict[str, Term] = {}
master_to_left: Dict[str, str] = {}
master_to_right: Dict[str, str] = {}
master_to_bi: Dict[str, str] = {}

directions = ensure_df(
PREFIX,
Expand All @@ -66,12 +115,16 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
force=force,
)
for master, lr, rl, bi in directions.values:
terms[master] = Term(
reference=Reference(prefix=PREFIX, identifier=master, name=names.get(master))
)
terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=names.get(lr)))
terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=names.get(rl)))
terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=names.get(bi)))
master_to_left[master] = lr
master_to_right[master] = rl
master_to_bi[master] = bi

name = names[master]

terms[master] = Term(reference=Reference(prefix=PREFIX, identifier=master, name=name))
terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=_get_lr_name(name)))
terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=_get_rl_name(name)))
terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=_get_bi_name(name)))

terms[master].append_relationship(has_left_to_right_reaction, terms[lr])
terms[master].append_relationship(has_right_to_left_reaction, terms[rl])
Expand All @@ -80,6 +133,38 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
terms[rl].append_parent(terms[master])
terms[bi].append_parent(terms[master])

# inspired by https://github.com/geneontology/go-ontology/blob/master/src/sparql/construct-rhea-reactions.sparql
sparql = """\
PREFIX rh:<http://rdf.rhea-db.org/>
SELECT ?reactionId ?side ?chebi WHERE {
?reaction rdfs:subClassOf rh:Reaction ;
rh:id ?reactionId .
?reaction rh:side ?side .
?side rh:contains ?participant .
?participant rh:compound ?compound .
?compound rh:chebi|rh:underlyingChebi|(rh:reactivePart/rh:chebi) ?chebi .
}
"""
for master_rhea_id, side_uri, chebi_uri in graph.query(sparql):
master_rhea_id = str(master_rhea_id)
chebi_reference = Reference(
prefix="chebi", identifier=chebi_uri[len("http://purl.obolibrary.org/obo/CHEBI_") :]
)
side = side_uri.split("_")[-1] # L or R
if side == "L":
left_rhea_id = master_to_left[master_rhea_id]
right_rhea_id = master_to_right[master_rhea_id]
elif side == "R":
left_rhea_id = master_to_right[master_rhea_id]
right_rhea_id = master_to_left[master_rhea_id]
else:
raise ValueError(f"Invalid side: {side_uri}")
terms[master_rhea_id].append_relationship(has_participant, chebi_reference)
terms[master_to_bi[master_rhea_id]].append_relationship(has_participant, chebi_reference)
terms[left_rhea_id].append_relationship(has_input, chebi_reference)
terms[right_rhea_id].append_relationship(has_output, chebi_reference)

hierarchy = ensure_df(
PREFIX,
url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv",
Expand All @@ -91,39 +176,59 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
raise ValueError(f"RHEA unrecognized relation: {relation}")
terms[source].append_parent(terms[target])

for xref_prefix, url in [
("ecocyc", "rhea2ecocyc"),
("kegg.reaction", "rhea2kegg_reaction"),
("reactome", "rhea2reactome"),
("macie", "rhea2macie"),
("metacyc", "rhea2metacyc"),
for xref_prefix, url, relation in [
("ecocyc", "rhea2ecocyc", None),
("kegg.reaction", "rhea2kegg_reaction", None),
("reactome", "rhea2reactome", None),
("macie", "rhea2macie", None),
("metacyc", "rhea2metacyc", None),
("go", "rhea2go", reaction_enabled_by_molecular_function),
("uniprot", "rhea2uniprot", enabled_by),
]:
xref_df = ensure_df(
PREFIX,
url=f"ftp://ftp.expasy.org/databases/rhea/tsv/{url}.tsv",
version=version,
force=force,
)
for rhea_id, _, _, xref_id in xref_df.values:
if rhea_id not in terms:
for directional_rhea_id, _direction, _master_rhea_id, xref_id in xref_df.values:
if directional_rhea_id not in terms:
logger.debug(
"[%s] could not find %s:%s for xref %s:%s",
PREFIX,
PREFIX,
rhea_id,
directional_rhea_id,
xref_prefix,
xref_id,
)
continue
terms[rhea_id].append_xref(Reference(prefix=xref_prefix, identifier=xref_id))
target_reference = Reference(prefix=xref_prefix, identifier=xref_id)
if isinstance(relation, TypeDef):
terms[directional_rhea_id].append_relationship(relation, target_reference)
else:
terms[directional_rhea_id].append_xref(target_reference)

# TODO are EC codes equivalent?
# TODO uniprot enabled by (RO:0002333)
# TODO names?
# TODO participants?
ec_df = ensure_df(
PREFIX,
url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-ec-iubmb.tsv",
version=version,
force=force,
)
for (
directional_rhea_id,
_status,
_direction,
_master_id,
ec,
_enzyme_status,
_iubmb,
) in ec_df.values:
terms[directional_rhea_id].append_relationship(
enabled_by, Reference(prefix="eccode", identifier=ec)
)

yield from terms.values()


if __name__ == "__main__":
RheaGetter.cli()
RheaGetter().write_default(write_obo=True, force=True)
10 changes: 5 additions & 5 deletions src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,14 +395,14 @@ def _definition_fp(self) -> str:

def iterate_relations(self) -> Iterable[Tuple[TypeDef, Reference]]:
"""Iterate over pairs of typedefs and targets."""
for typedef, targets in self.relationships.items():
for target in targets:
for typedef, targets in sorted(self.relationships.items(), key=_sort_relations):
for target in sorted(targets, key=lambda ref: ref.preferred_curie):
yield typedef, target

def iterate_properties(self) -> Iterable[Tuple[str, str]]:
"""Iterate over pairs of property and values."""
for prop, values in self.properties.items():
for value in values:
for prop, values in sorted(self.properties.items()):
for value in sorted(values):
yield prop, value

def iterate_obo_lines(self, *, ontology, typedefs) -> Iterable[str]:
Expand Down Expand Up @@ -466,7 +466,7 @@ def _escape(s) -> str:

def _sort_relations(r):
typedef, _references = r
return typedef.reference.name or typedef.reference.identifier
return typedef.preferred_curie


def _sort_properties(r):
Expand Down
31 changes: 26 additions & 5 deletions src/pyobo/struct/typedef.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
]


def _bool_to_obo(v: bool) -> str:
return "true" if v else "false"


@dataclass
class TypeDef(Referenced):
"""A type definition in OBO.
Expand Down Expand Up @@ -88,7 +92,7 @@ def iterate_obo_lines(self) -> Iterable[str]:
yield f'def: "{self.definition}"'

if self.is_metadata_tag is not None:
yield f'is_metadata_tag: {"true" if self.is_metadata_tag else "false"}'
yield f"is_metadata_tag: {_bool_to_obo(self.is_metadata_tag)}"

if self.namespace:
yield f"namespace: {self.namespace}"
Expand All @@ -113,6 +117,10 @@ def iterate_obo_lines(self) -> Iterable[str]:
yield f"holds_over_chain: {_chain} ! {_names}"
if self.inverse:
yield f"inverse_of: {self.inverse}"
if self.domain:
yield f"domain: {self.domain}"
if self.range:
yield f"range: {self.range}"

@classmethod
def from_triple(cls, prefix: str, identifier: str, name: Optional[str] = None) -> "TypeDef":
Expand Down Expand Up @@ -161,13 +169,19 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]:
"species with RO:0002162 (in taxon)",
)
has_left_to_right_reaction = TypeDef(
Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction")
Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction"),
is_metadata_tag=True,
)
has_right_to_left_reaction = TypeDef(
Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction")
Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction"),
is_metadata_tag=True,
)
has_bidirectional_reaction = TypeDef(
Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction")
Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction"),
is_metadata_tag=True,
)
reaction_enabled_by_molecular_function = TypeDef(
Reference(prefix="debio", identifier="0000047", name="reaction enabled by molecular function")
)


Expand Down Expand Up @@ -291,7 +305,14 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]:
is_immediately_transformed_from = TypeDef.from_triple(
prefix=SIO_PREFIX, identifier="000658", name="is immediately transformed from"
)
enables = TypeDef.from_triple(prefix="RO", identifier="0002327", name="enables")

_enables_reference = Reference(prefix=RO_PREFIX, identifier="0002327", name="enables")
_enabled_by_reference = Reference(prefix=RO_PREFIX, identifier="0002333", name="enabled by")
enables = TypeDef(reference=_enables_reference, inverse=_enabled_by_reference)
enabled_by = TypeDef(reference=_enabled_by_reference, inverse=_enables_reference)

has_input = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002233", name="has input")
has_output = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002234", name="has output")

"""ChEBI"""

Expand Down

0 comments on commit a2da824

Please sign in to comment.