diff --git a/src/pyobo/sources/famplex.py b/src/pyobo/sources/famplex.py
index 29011bf7..d0441815 100644
--- a/src/pyobo/sources/famplex.py
+++ b/src/pyobo/sources/famplex.py
@@ -151,9 +151,11 @@ def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
}
xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s))
xrefs_df[1] = [
- bioregistry.standardize_identifier(xref_prefix, xref_identifier)
- if xref_prefix != "nextprot.family"
- else xref_identifier[len("FA:") :]
+ (
+ bioregistry.standardize_identifier(xref_prefix, xref_identifier)
+ if xref_prefix != "nextprot.family"
+ else xref_identifier[len("FA:") :]
+ )
for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values
]
diff --git a/src/pyobo/sources/rhea.py b/src/pyobo/sources/rhea.py
index d680d395..7459005f 100644
--- a/src/pyobo/sources/rhea.py
+++ b/src/pyobo/sources/rhea.py
@@ -3,31 +3,51 @@
"""Converter for Rhea."""
import logging
-from typing import Iterable
+from typing import TYPE_CHECKING, Dict, Iterable, Optional
+import bioversions
import pystow
from pyobo.struct import Obo, Reference, Term
from pyobo.struct.typedef import (
+ TypeDef,
+ enabled_by,
has_bidirectional_reaction,
+ has_input,
has_left_to_right_reaction,
+ has_output,
+ has_participant,
has_right_to_left_reaction,
+ reaction_enabled_by_molecular_function,
)
from pyobo.utils.path import ensure_df
+if TYPE_CHECKING:
+ import rdflib
+
__all__ = [
"RheaGetter",
]
logger = logging.getLogger(__name__)
PREFIX = "rhea"
+RHEA_RDF_GZ_URL = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
class RheaGetter(Obo):
"""An ontology representation of Rhea's chemical reaction database."""
ontology = bioversions_key = PREFIX
- typedefs = [has_left_to_right_reaction, has_bidirectional_reaction, has_right_to_left_reaction]
+ typedefs = [
+ has_left_to_right_reaction,
+ has_bidirectional_reaction,
+ has_right_to_left_reaction,
+ enabled_by,
+ has_input,
+ has_output,
+ has_participant,
+ reaction_enabled_by_molecular_function,
+ ]
def iter_terms(self, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in the ontology."""
@@ -39,25 +59,54 @@ def get_obo(force: bool = False) -> Obo:
return RheaGetter(force=force)
+def ensure_rhea_rdf(version: Optional[str] = None, force: bool = False) -> "rdflib.Graph":
+    """Get the Rhea RDF graph.
+
+    :param version: The Rhea version to use. If not given, it is looked up
+        with :func:`bioversions.get_version`.
+    :param force: Passed through to :func:`pystow.ensure_rdf` as ``force``.
+    :returns: The value returned by :func:`pystow.ensure_rdf` for the Rhea RDF
+        dump, which is parsed with ``format="xml"``.
+    """
+    # see docs: https://ftp.expasy.org/databases/rhea/rdf/rhea_rdf_documentation.pdf
+    resolved_version = bioversions.get_version(PREFIX) if version is None else version
+    return pystow.ensure_rdf(
+        "pyobo",
+        "raw",
+        PREFIX,
+        resolved_version,
+        url=RHEA_RDF_GZ_URL,
+        force=force,
+        parse_kwargs={"format": "xml"},
+    )
+
+
+def _get_lr_name(name: str) -> str:
+    """Rewrite a reaction label as a left-to-right reaction name.
+
+    Every ``" = "`` in the label becomes ``" => "``; a label without it is
+    returned as is.
+    """
+    pieces = name.split(" = ")
+    return " => ".join(pieces)
+
+
+def _get_rl_name(name: str) -> str:
+    """Rewrite a reaction label as a right-to-left reaction name.
+
+    The label is split on the first ``" = "`` and the two sides are swapped
+    around ``" => "``.
+
+    :param name: The label of the master reaction
+    :returns: The reversed name, or the label unchanged if it has no ``" = "``
+        (the left-to-right and bidirectional helpers are also no-ops on such labels)
+    """
+    # partition never raises, unlike unpacking the result of split on a label without " = "
+    left, separator, right = name.partition(" = ")
+    if not separator:
+        return name
+    return f"{right} => {left}"
+
+
+def _get_bi_name(name: str) -> str:
+    """Rewrite a reaction label as a bidirectional reaction name.
+
+    Every ``" = "`` in the label becomes ``" <=> "``; a label without it is
+    returned as is.
+    """
+    pieces = name.split(" = ")
+    return " <=> ".join(pieces)
+
+
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in Rhea."""
- url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
- graph = pystow.ensure_rdf(
- "pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
- )
+ graph = ensure_rhea_rdf(version=version, force=force)
result = graph.query(
- """
- PREFIX rh:
- SELECT ?reaction ?reactionId ?reactionLabel WHERE {
- ?reaction rdfs:subClassOf rh:Reaction .
- ?reaction rh:id ?reactionId .
- ?reaction rdfs:label ?reactionLabel .
- }
+ """\
+ PREFIX rh:
+ SELECT ?reaction ?reactionId ?reactionLabel WHERE {
+ ?reaction rdfs:subClassOf rh:Reaction ;
+ rh:id ?reactionId ;
+ rdfs:label ?reactionLabel .
+ }
"""
)
- names = {str(identifier): name for _, identifier, name in result}
+ names = {str(identifier): str(name) for _, identifier, name in result}
- terms = {}
+ terms: Dict[str, Term] = {}
+ master_to_left: Dict[str, str] = {}
+ master_to_right: Dict[str, str] = {}
+ master_to_bi: Dict[str, str] = {}
directions = ensure_df(
PREFIX,
@@ -66,12 +115,16 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
force=force,
)
for master, lr, rl, bi in directions.values:
- terms[master] = Term(
- reference=Reference(prefix=PREFIX, identifier=master, name=names.get(master))
- )
- terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=names.get(lr)))
- terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=names.get(rl)))
- terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=names.get(bi)))
+ master_to_left[master] = lr
+ master_to_right[master] = rl
+ master_to_bi[master] = bi
+
+ name = names[master]
+
+ terms[master] = Term(reference=Reference(prefix=PREFIX, identifier=master, name=name))
+ terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=_get_lr_name(name)))
+ terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=_get_rl_name(name)))
+ terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=_get_bi_name(name)))
terms[master].append_relationship(has_left_to_right_reaction, terms[lr])
terms[master].append_relationship(has_right_to_left_reaction, terms[rl])
@@ -80,6 +133,38 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
terms[rl].append_parent(terms[master])
terms[bi].append_parent(terms[master])
+ # inspired by https://github.com/geneontology/go-ontology/blob/master/src/sparql/construct-rhea-reactions.sparql
+ sparql = """\
+ PREFIX rh:
+ SELECT ?reactionId ?side ?chebi WHERE {
+ ?reaction rdfs:subClassOf rh:Reaction ;
+ rh:id ?reactionId .
+
+ ?reaction rh:side ?side .
+ ?side rh:contains ?participant .
+ ?participant rh:compound ?compound .
+ ?compound rh:chebi|rh:underlyingChebi|(rh:reactivePart/rh:chebi) ?chebi .
+ }
+ """
+ for master_rhea_id, side_uri, chebi_uri in graph.query(sparql):
+ master_rhea_id = str(master_rhea_id)
+ chebi_reference = Reference(
+ prefix="chebi", identifier=chebi_uri[len("http://purl.obolibrary.org/obo/CHEBI_") :]
+ )
+ side = side_uri.split("_")[-1] # L or R
+ if side == "L":
+ left_rhea_id = master_to_left[master_rhea_id]
+ right_rhea_id = master_to_right[master_rhea_id]
+ elif side == "R":
+ left_rhea_id = master_to_right[master_rhea_id]
+ right_rhea_id = master_to_left[master_rhea_id]
+ else:
+ raise ValueError(f"Invalid side: {side_uri}")
+ terms[master_rhea_id].append_relationship(has_participant, chebi_reference)
+ terms[master_to_bi[master_rhea_id]].append_relationship(has_participant, chebi_reference)
+ terms[left_rhea_id].append_relationship(has_input, chebi_reference)
+ terms[right_rhea_id].append_relationship(has_output, chebi_reference)
+
hierarchy = ensure_df(
PREFIX,
url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv",
@@ -91,12 +176,14 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
raise ValueError(f"RHEA unrecognized relation: {relation}")
terms[source].append_parent(terms[target])
- for xref_prefix, url in [
- ("ecocyc", "rhea2ecocyc"),
- ("kegg.reaction", "rhea2kegg_reaction"),
- ("reactome", "rhea2reactome"),
- ("macie", "rhea2macie"),
- ("metacyc", "rhea2metacyc"),
+ for xref_prefix, url, relation in [
+ ("ecocyc", "rhea2ecocyc", None),
+ ("kegg.reaction", "rhea2kegg_reaction", None),
+ ("reactome", "rhea2reactome", None),
+ ("macie", "rhea2macie", None),
+ ("metacyc", "rhea2metacyc", None),
+ ("go", "rhea2go", reaction_enabled_by_molecular_function),
+ ("uniprot", "rhea2uniprot", enabled_by),
]:
xref_df = ensure_df(
PREFIX,
@@ -104,26 +191,44 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
version=version,
force=force,
)
- for rhea_id, _, _, xref_id in xref_df.values:
- if rhea_id not in terms:
+ for directional_rhea_id, _direction, _master_rhea_id, xref_id in xref_df.values:
+ if directional_rhea_id not in terms:
logger.debug(
"[%s] could not find %s:%s for xref %s:%s",
PREFIX,
PREFIX,
- rhea_id,
+ directional_rhea_id,
xref_prefix,
xref_id,
)
continue
- terms[rhea_id].append_xref(Reference(prefix=xref_prefix, identifier=xref_id))
+ target_reference = Reference(prefix=xref_prefix, identifier=xref_id)
+ if isinstance(relation, TypeDef):
+ terms[directional_rhea_id].append_relationship(relation, target_reference)
+ else:
+ terms[directional_rhea_id].append_xref(target_reference)
- # TODO are EC codes equivalent?
- # TODO uniprot enabled by (RO:0002333)
- # TODO names?
- # TODO participants?
+ ec_df = ensure_df(
+ PREFIX,
+ url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-ec-iubmb.tsv",
+ version=version,
+ force=force,
+ )
+ for (
+ directional_rhea_id,
+ _status,
+ _direction,
+ _master_id,
+ ec,
+ _enzyme_status,
+ _iubmb,
+ ) in ec_df.values:
+ terms[directional_rhea_id].append_relationship(
+ enabled_by, Reference(prefix="eccode", identifier=ec)
+ )
yield from terms.values()
if __name__ == "__main__":
- RheaGetter.cli()
+ RheaGetter().write_default(write_obo=True, force=True)
diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py
index 1e3f052b..03c79f80 100644
--- a/src/pyobo/struct/struct.py
+++ b/src/pyobo/struct/struct.py
@@ -395,14 +395,14 @@ def _definition_fp(self) -> str:
def iterate_relations(self) -> Iterable[Tuple[TypeDef, Reference]]:
"""Iterate over pairs of typedefs and targets."""
- for typedef, targets in self.relationships.items():
- for target in targets:
+ for typedef, targets in sorted(self.relationships.items(), key=_sort_relations):
+ for target in sorted(targets, key=lambda ref: ref.preferred_curie):
yield typedef, target
def iterate_properties(self) -> Iterable[Tuple[str, str]]:
"""Iterate over pairs of property and values."""
- for prop, values in self.properties.items():
- for value in values:
+ for prop, values in sorted(self.properties.items()):
+ for value in sorted(values):
yield prop, value
def iterate_obo_lines(self, *, ontology, typedefs) -> Iterable[str]:
@@ -466,7 +466,7 @@ def _escape(s) -> str:
def _sort_relations(r):
typedef, _references = r
- return typedef.reference.name or typedef.reference.identifier
+ return typedef.preferred_curie
def _sort_properties(r):
diff --git a/src/pyobo/struct/typedef.py b/src/pyobo/struct/typedef.py
index 8cde88b9..dfc00522 100644
--- a/src/pyobo/struct/typedef.py
+++ b/src/pyobo/struct/typedef.py
@@ -48,6 +48,10 @@
]
+def _bool_to_obo(v: bool) -> str:
+    """Return the lowercase literal ``true`` or ``false`` for the truthiness of ``v``."""
+    if v:
+        return "true"
+    return "false"
+
+
@dataclass
class TypeDef(Referenced):
"""A type definition in OBO.
@@ -88,7 +92,7 @@ def iterate_obo_lines(self) -> Iterable[str]:
yield f'def: "{self.definition}"'
if self.is_metadata_tag is not None:
- yield f'is_metadata_tag: {"true" if self.is_metadata_tag else "false"}'
+ yield f"is_metadata_tag: {_bool_to_obo(self.is_metadata_tag)}"
if self.namespace:
yield f"namespace: {self.namespace}"
@@ -113,6 +117,10 @@ def iterate_obo_lines(self) -> Iterable[str]:
yield f"holds_over_chain: {_chain} ! {_names}"
if self.inverse:
yield f"inverse_of: {self.inverse}"
+ if self.domain:
+ yield f"domain: {self.domain}"
+ if self.range:
+ yield f"range: {self.range}"
@classmethod
def from_triple(cls, prefix: str, identifier: str, name: Optional[str] = None) -> "TypeDef":
@@ -161,13 +169,19 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]:
"species with RO:0002162 (in taxon)",
)
has_left_to_right_reaction = TypeDef(
- Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction")
+ Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction"),
+ is_metadata_tag=True,
)
has_right_to_left_reaction = TypeDef(
- Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction")
+ Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction"),
+ is_metadata_tag=True,
)
has_bidirectional_reaction = TypeDef(
- Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction")
+ Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction"),
+ is_metadata_tag=True,
+)
+reaction_enabled_by_molecular_function = TypeDef(
+ Reference(prefix="debio", identifier="0000047", name="reaction enabled by molecular function")
)
@@ -291,7 +305,14 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]:
is_immediately_transformed_from = TypeDef.from_triple(
prefix=SIO_PREFIX, identifier="000658", name="is immediately transformed from"
)
-enables = TypeDef.from_triple(prefix="RO", identifier="0002327", name="enables")
+
+_enables_reference = Reference(prefix=RO_PREFIX, identifier="0002327", name="enables")
+_enabled_by_reference = Reference(prefix=RO_PREFIX, identifier="0002333", name="enabled by")
+enables = TypeDef(reference=_enables_reference, inverse=_enabled_by_reference)
+enabled_by = TypeDef(reference=_enabled_by_reference, inverse=_enables_reference)
+
+has_input = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002233", name="has input")
+has_output = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002234", name="has output")
"""ChEBI"""