diff --git a/src/pyobo/sources/famplex.py b/src/pyobo/sources/famplex.py index 29011bf7..d0441815 100644 --- a/src/pyobo/sources/famplex.py +++ b/src/pyobo/sources/famplex.py @@ -151,9 +151,11 @@ def _get_xref_df(version: str) -> Mapping[str, List[Reference]]: } xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s)) xrefs_df[1] = [ - bioregistry.standardize_identifier(xref_prefix, xref_identifier) - if xref_prefix != "nextprot.family" - else xref_identifier[len("FA:") :] + ( + bioregistry.standardize_identifier(xref_prefix, xref_identifier) + if xref_prefix != "nextprot.family" + else xref_identifier[len("FA:") :] + ) for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values ] diff --git a/src/pyobo/sources/rhea.py b/src/pyobo/sources/rhea.py index d680d395..7459005f 100644 --- a/src/pyobo/sources/rhea.py +++ b/src/pyobo/sources/rhea.py @@ -3,31 +3,51 @@ """Converter for Rhea.""" import logging -from typing import Iterable +from typing import TYPE_CHECKING, Dict, Iterable, Optional +import bioversions import pystow from pyobo.struct import Obo, Reference, Term from pyobo.struct.typedef import ( + TypeDef, + enabled_by, has_bidirectional_reaction, + has_input, has_left_to_right_reaction, + has_output, + has_participant, has_right_to_left_reaction, + reaction_enabled_by_molecular_function, ) from pyobo.utils.path import ensure_df +if TYPE_CHECKING: + import rdflib + __all__ = [ "RheaGetter", ] logger = logging.getLogger(__name__) PREFIX = "rhea" +RHEA_RDF_GZ_URL = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz" class RheaGetter(Obo): """An ontology representation of Rhea's chemical reaction database.""" ontology = bioversions_key = PREFIX - typedefs = [has_left_to_right_reaction, has_bidirectional_reaction, has_right_to_left_reaction] + typedefs = [ + has_left_to_right_reaction, + has_bidirectional_reaction, + has_right_to_left_reaction, + enabled_by, + has_input, + has_output, + has_participant, + reaction_enabled_by_molecular_function, + ] def 
iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms in the ontology.""" @@ -39,25 +59,54 @@ def get_obo(force: bool = False) -> Obo: return RheaGetter(force=force) +def ensure_rhea_rdf(version: Optional[str] = None, force: bool = False) -> "rdflib.Graph": + """Get the Rhea RDF graph.""" + # see docs: https://ftp.expasy.org/databases/rhea/rdf/rhea_rdf_documentation.pdf + if version is None: + version = bioversions.get_version(PREFIX) + return pystow.ensure_rdf( + "pyobo", + "raw", + PREFIX, + version, + url=RHEA_RDF_GZ_URL, + force=force, + parse_kwargs=dict(format="xml"), + ) + + +def _get_lr_name(name: str) -> str: + return name.replace(" = ", " => ") + + +def _get_rl_name(name: str) -> str: + left, right = name.split(" = ", 1) + return f"{right} => {left}" + + +def _get_bi_name(name: str) -> str: + return name.replace(" = ", " <=> ") + + def iter_terms(version: str, force: bool = False) -> Iterable[Term]: """Iterate over terms in Rhea.""" - url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz" - graph = pystow.ensure_rdf( - "pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml") - ) + graph = ensure_rhea_rdf(version=version, force=force) result = graph.query( - """ - PREFIX rh:<http://rdf.rhea-db.org/> - SELECT ?reaction ?reactionId ?reactionLabel WHERE { - ?reaction rdfs:subClassOf rh:Reaction . - ?reaction rh:id ?reactionId . - ?reaction rdfs:label ?reactionLabel . - } + """\ + PREFIX rh:<http://rdf.rhea-db.org/> + SELECT ?reaction ?reactionId ?reactionLabel WHERE { + ?reaction rdfs:subClassOf rh:Reaction ; + rh:id ?reactionId ; + rdfs:label ?reactionLabel . 
+ } """ ) - names = {str(identifier): name for _, identifier, name in result} + names = {str(identifier): str(name) for _, identifier, name in result} - terms = {} + terms: Dict[str, Term] = {} + master_to_left: Dict[str, str] = {} + master_to_right: Dict[str, str] = {} + master_to_bi: Dict[str, str] = {} directions = ensure_df( PREFIX, @@ -66,12 +115,16 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]: force=force, ) for master, lr, rl, bi in directions.values: - terms[master] = Term( - reference=Reference(prefix=PREFIX, identifier=master, name=names.get(master)) - ) - terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=names.get(lr))) - terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=names.get(rl))) - terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=names.get(bi))) + master_to_left[master] = lr + master_to_right[master] = rl + master_to_bi[master] = bi + + name = names[master] + + terms[master] = Term(reference=Reference(prefix=PREFIX, identifier=master, name=name)) + terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=_get_lr_name(name))) + terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=_get_rl_name(name))) + terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=_get_bi_name(name))) terms[master].append_relationship(has_left_to_right_reaction, terms[lr]) terms[master].append_relationship(has_right_to_left_reaction, terms[rl]) @@ -80,6 +133,38 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]: terms[rl].append_parent(terms[master]) terms[bi].append_parent(terms[master]) + # inspired by https://github.com/geneontology/go-ontology/blob/master/src/sparql/construct-rhea-reactions.sparql + sparql = """\ + PREFIX rh:<http://rdf.rhea-db.org/> + SELECT ?reactionId ?side ?chebi WHERE { + ?reaction rdfs:subClassOf rh:Reaction ; + rh:id ?reactionId . + + ?reaction rh:side ?side . + ?side rh:contains ?participant . 
+ ?participant rh:compound ?compound . + ?compound rh:chebi|rh:underlyingChebi|(rh:reactivePart/rh:chebi) ?chebi . + } + """ + for master_rhea_id, side_uri, chebi_uri in graph.query(sparql): + master_rhea_id = str(master_rhea_id) + chebi_reference = Reference( + prefix="chebi", identifier=chebi_uri[len("http://purl.obolibrary.org/obo/CHEBI_") :] + ) + side = side_uri.split("_")[-1] # L or R + if side == "L": + left_rhea_id = master_to_left[master_rhea_id] + right_rhea_id = master_to_right[master_rhea_id] + elif side == "R": + left_rhea_id = master_to_right[master_rhea_id] + right_rhea_id = master_to_left[master_rhea_id] + else: + raise ValueError(f"Invalid side: {side_uri}") + terms[master_rhea_id].append_relationship(has_participant, chebi_reference) + terms[master_to_bi[master_rhea_id]].append_relationship(has_participant, chebi_reference) + terms[left_rhea_id].append_relationship(has_input, chebi_reference) + terms[right_rhea_id].append_relationship(has_output, chebi_reference) + hierarchy = ensure_df( PREFIX, url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv", @@ -91,12 +176,14 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]: raise ValueError(f"RHEA unrecognized relation: {relation}") terms[source].append_parent(terms[target]) - for xref_prefix, url in [ - ("ecocyc", "rhea2ecocyc"), - ("kegg.reaction", "rhea2kegg_reaction"), - ("reactome", "rhea2reactome"), - ("macie", "rhea2macie"), - ("metacyc", "rhea2metacyc"), + for xref_prefix, url, relation in [ + ("ecocyc", "rhea2ecocyc", None), + ("kegg.reaction", "rhea2kegg_reaction", None), + ("reactome", "rhea2reactome", None), + ("macie", "rhea2macie", None), + ("metacyc", "rhea2metacyc", None), + ("go", "rhea2go", reaction_enabled_by_molecular_function), + ("uniprot", "rhea2uniprot", enabled_by), ]: xref_df = ensure_df( PREFIX, @@ -104,26 +191,44 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]: version=version, force=force, ) - for rhea_id, _, _, 
xref_id in xref_df.values: - if rhea_id not in terms: + for directional_rhea_id, _direction, _master_rhea_id, xref_id in xref_df.values: + if directional_rhea_id not in terms: logger.debug( "[%s] could not find %s:%s for xref %s:%s", PREFIX, PREFIX, - rhea_id, + directional_rhea_id, xref_prefix, xref_id, ) continue - terms[rhea_id].append_xref(Reference(prefix=xref_prefix, identifier=xref_id)) + target_reference = Reference(prefix=xref_prefix, identifier=xref_id) + if isinstance(relation, TypeDef): + terms[directional_rhea_id].append_relationship(relation, target_reference) + else: + terms[directional_rhea_id].append_xref(target_reference) - # TODO are EC codes equivalent? - # TODO uniprot enabled by (RO:0002333) - # TODO names? - # TODO participants? + ec_df = ensure_df( + PREFIX, + url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-ec-iubmb.tsv", + version=version, + force=force, + ) + for ( + directional_rhea_id, + _status, + _direction, + _master_id, + ec, + _enzyme_status, + _iubmb, + ) in ec_df.values: + terms[directional_rhea_id].append_relationship( + enabled_by, Reference(prefix="eccode", identifier=ec) + ) yield from terms.values() if __name__ == "__main__": - RheaGetter.cli() + RheaGetter().write_default(write_obo=True, force=True) diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py index 1e3f052b..03c79f80 100644 --- a/src/pyobo/struct/struct.py +++ b/src/pyobo/struct/struct.py @@ -395,14 +395,14 @@ def _definition_fp(self) -> str: def iterate_relations(self) -> Iterable[Tuple[TypeDef, Reference]]: """Iterate over pairs of typedefs and targets.""" - for typedef, targets in self.relationships.items(): - for target in targets: + for typedef, targets in sorted(self.relationships.items(), key=_sort_relations): + for target in sorted(targets, key=lambda ref: ref.preferred_curie): yield typedef, target def iterate_properties(self) -> Iterable[Tuple[str, str]]: """Iterate over pairs of property and values.""" - for prop, values in 
self.properties.items(): - for value in values: + for prop, values in sorted(self.properties.items()): + for value in sorted(values): yield prop, value def iterate_obo_lines(self, *, ontology, typedefs) -> Iterable[str]: @@ -466,7 +466,7 @@ def _escape(s) -> str: def _sort_relations(r): typedef, _references = r - return typedef.reference.name or typedef.reference.identifier + return typedef.preferred_curie def _sort_properties(r): diff --git a/src/pyobo/struct/typedef.py b/src/pyobo/struct/typedef.py index 8cde88b9..dfc00522 100644 --- a/src/pyobo/struct/typedef.py +++ b/src/pyobo/struct/typedef.py @@ -48,6 +48,10 @@ ] +def _bool_to_obo(v: bool) -> str: + return "true" if v else "false" + + @dataclass class TypeDef(Referenced): """A type definition in OBO. @@ -88,7 +92,7 @@ def iterate_obo_lines(self) -> Iterable[str]: yield f'def: "{self.definition}"' if self.is_metadata_tag is not None: - yield f'is_metadata_tag: {"true" if self.is_metadata_tag else "false"}' + yield f"is_metadata_tag: {_bool_to_obo(self.is_metadata_tag)}" if self.namespace: yield f"namespace: {self.namespace}" @@ -113,6 +117,10 @@ def iterate_obo_lines(self) -> Iterable[str]: yield f"holds_over_chain: {_chain} ! 
{_names}" if self.inverse: yield f"inverse_of: {self.inverse}" + if self.domain: + yield f"domain: {self.domain}" + if self.range: + yield f"range: {self.range}" @classmethod def from_triple(cls, prefix: str, identifier: str, name: Optional[str] = None) -> "TypeDef": @@ -161,13 +169,19 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]: "species with RO:0002162 (in taxon)", ) has_left_to_right_reaction = TypeDef( - Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction") + Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction"), + is_metadata_tag=True, ) has_right_to_left_reaction = TypeDef( - Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction") + Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction"), + is_metadata_tag=True, ) has_bidirectional_reaction = TypeDef( - Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction") + Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction"), + is_metadata_tag=True, +) +reaction_enabled_by_molecular_function = TypeDef( + Reference(prefix="debio", identifier="0000047", name="reaction enabled by molecular function") ) @@ -291,7 +305,14 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]: is_immediately_transformed_from = TypeDef.from_triple( prefix=SIO_PREFIX, identifier="000658", name="is immediately transformed from" ) -enables = TypeDef.from_triple(prefix="RO", identifier="0002327", name="enables") + +_enables_reference = Reference(prefix=RO_PREFIX, identifier="0002327", name="enables") +_enabled_by_reference = Reference(prefix=RO_PREFIX, identifier="0002333", name="enabled by") +enables = TypeDef(reference=_enables_reference, inverse=_enabled_by_reference) +enabled_by = TypeDef(reference=_enabled_by_reference, inverse=_enables_reference) + +has_input = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002233", name="has 
input") +has_output = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002234", name="has output") """ChEBI"""