-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparser.py
540 lines (437 loc) · 23.8 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
import re
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from typing import List
# Prefix -> namespace IRI map; handed to ElementTree's find()/findall() so prefixed tags such as
# "rh:name" can be resolved
NAMESPACES = dict(
    rh="http://rdf.rhea-db.org/",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
)

# Fully-qualified ("{namespace}local") attribute names used to index `xml.etree.ElementTree.Element.attrib`:
#   ATTRIB_KEYS["rdf:resource"] == "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource"
#   ATTRIB_KEYS["rdf:about"]    == "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
ATTRIB_KEYS = {
    "rdf:" + local_name: "{" + NAMESPACES["rdf"] + "}" + local_name
    for local_name in ("resource", "about")
}

# Base IRIs that get removed from absolute IRIs to obtain the short identifiers
BASE_IRIS = dict(
    rh=NAMESPACES["rh"],
    pubmed="http://rdf.ncbi.nlm.nih.gov/pubmed/",
    obo="http://purl.obolibrary.org/obo/",
    ec="http://purl.uniprot.org/enzyme/",
)
class ReactivePartDataFactory:
    """
    Produce (comp_num, rp_entry) associations from reactive-part descriptions.

    A macromolecule (of type "rh:GenericCompound") has reactive parts, specified by descriptions like:

        <rdf:Description rdf:about="http://rdf.rhea-db.org/Compound_9846_rp1">
            <rdfs:subClassOf rdf:resource="http://rdf.rhea-db.org/ReactivePart"/>
            <rh:name>N(6)-methyl-L-lysine residue</rh:name>
            <rh:htmlName>...</rh:htmlName>
            <rh:formula>C7H15N2O</rh:formula>
            <rh:charge>1</rh:charge>
            <rh:chebi rdf:resource="http://purl.obolibrary.org/obo/CHEBI_61929"/>
            <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_61929"/>
        </rdf:Description>

    The relative IRI "Compound_9846_rp1" indicates this is the first reactive part of compound "9846".

    This data factory produces all such associations as tuples (comp_num, rp_entry).
    """

    # "rp" may be followed by several digits, so a compound's tenth (or later) reactive part is accepted too
    relative_iri_pattern = re.compile(r"Compound_\d+_rp\d+")

    @staticmethod
    def _strip_prefix(text: str, prefix: str) -> str:
        """
        Return `text` without the leading `prefix`; `text` is returned unchanged if it does not start with it.

        `str.lstrip` is not suitable here because it removes any leading run of the given *characters*.
        """
        if text.startswith(prefix):
            return text[len(prefix):]
        return text

    @classmethod
    def is_valid_relative_iri(cls, relative_iri: str) -> bool:
        """Tell whether the whole `relative_iri` looks like "Compound_<digits>_rp<digits>"."""
        return cls.relative_iri_pattern.fullmatch(relative_iri) is not None

    @classmethod
    def _add_rp_name(cls, rp_entry: dict, description: Element):
        """Copy the text of "rh:name", if that child exists, to rp_entry["name"]."""
        node = description.find("rh:name", NAMESPACES)
        if node is not None:
            rp_entry["name"] = node.text

    @classmethod
    def _add_rp_formula(cls, rp_entry: dict, description: Element):
        """Copy the text of "rh:formula", if that child exists, to rp_entry["formula"]."""
        node = description.find("rh:formula", NAMESPACES)
        if node is not None:
            rp_entry["formula"] = node.text

    @classmethod
    def _add_rp_charge(cls, rp_entry: dict, description: Element):
        """Store the text of "rh:charge", if that child exists, as an integer in rp_entry["charge"]."""
        node = description.find("rh:charge", NAMESPACES)
        if node is not None:
            rp_entry["charge"] = int(node.text)

    @classmethod
    def _add_rp_chebi_id(cls, rp_entry: dict, description: Element):
        """
        Store the ChEBI id referenced by "rh:chebi", if that child exists, in rp_entry["chebi_id"].

        E.g. the resource "http://purl.obolibrary.org/obo/CHEBI_61929" is stored as "CHEBI:61929".
        """
        node = description.find("rh:chebi", NAMESPACES)
        if node is not None:
            chebi_iri = node.attrib[ATTRIB_KEYS["rdf:resource"]]
            rp_entry["chebi_id"] = cls._strip_prefix(chebi_iri, BASE_IRIS["obo"]).replace("_", ":")

    @classmethod
    def produce(cls, relative_iri: str, description: Element):
        """
        Yield a single (comp_num, rp_entry) tuple for a reactive-part description.

        `relative_iri` is assumed to be valid according to `is_valid_relative_iri`, e.g. "Compound_10594_rp2",
        so the compound number is its second "_"-separated field ("10594").
        """
        comp_num = relative_iri.split("_")[1]

        rp_entry = {}
        cls._add_rp_name(rp_entry, description)
        cls._add_rp_formula(rp_entry, description)
        cls._add_rp_charge(rp_entry, description)
        cls._add_rp_chebi_id(rp_entry, description)

        yield comp_num, rp_entry
class SideDataFactory:
    """
    Produce (rhea_id, side_key, comp_num, participation_entry) associations from reaction-side descriptions.

    Each reaction specifies its two sides ("L" for left, "R" for right). E.g.

        <rdf:Description rdf:about="http://rdf.rhea-db.org/35975">
            # some other tags ignored
            <rh:side rdf:resource="http://rdf.rhea-db.org/35975_L"/>
        </rdf:Description>
        <rdf:Description rdf:about="http://rdf.rhea-db.org/35975">
            <rh:side rdf:resource="http://rdf.rhea-db.org/35975_R"/>
        </rdf:Description>

    Each side description specifies a "contains" relationship for each participating compound. E.g.

        <rdf:Description rdf:about="http://rdf.rhea-db.org/35975_R">
            <rh:contains rdf:resource="http://rdf.rhea-db.org/Participant_35975_compound_3512"/>
            <rh:contains1 rdf:resource="http://rdf.rhea-db.org/Participant_35975_compound_3512"/>
        </rdf:Description>

    Note that "rh:contains" only specifies containing without stoichiometry. "rh:contains1" indicates a containing
    relationship along with stoichiometry 1.

    Therefore from this side description we derive the following associations:

    - The right side of reaction "35975" contains compound "3512", and
    - the compound's stoichiometry is 1 in the reaction.

    We can also get the location ("in" or "out") of a compound, if its reaction is a transport reaction and the
    compound is found in both sides of the reaction. E.g.

        <rdf:Description rdf:about="http://rdf.rhea-db.org/69560_L">
            <rh:contains rdf:resource="http://rdf.rhea-db.org/Participant_69560_compound_3249_out"/>
            <rh:contains1 rdf:resource="http://rdf.rhea-db.org/Participant_69560_compound_3249_out"/>
        </rdf:Description>

    The location of compound "3249" is "out".

    This data factory produces association quadruples of (rhea_id, side_key, comp_num, participation_entry), which
    indicate:

    - The reaction with `rhea_id`, on its left/right side specified by `side_key`, contains a compound indicated by
      `comp_num`, and
    - the compound's stoichiometry and location (if any) are wrapped in the `participation_entry`.
    """

    # expands to "{http://rdf.rhea-db.org/}contains"
    contains_prefix = f"{{{NAMESPACES['rh']}}}contains"

    # last character of a side's relative IRI -> key under which the side is stored in a reaction entry
    side_char_to_key = {
        "L": "side_l",
        "R": "side_r"
    }

    @staticmethod
    def _strip_prefix(text: str, prefix: str) -> str:
        """
        Return `text` without the leading `prefix`; `text` is returned unchanged if it does not start with it.

        `str.lstrip` is not suitable here because it removes any leading run of the given *characters*.
        """
        if text.startswith(prefix):
            return text[len(prefix):]
        return text

    @classmethod
    def is_valid_relative_iri(cls, relative_iri: str) -> bool:
        """Tell whether `relative_iri` ends with a known side character ("L" or "R")."""
        # slicing (instead of indexing) makes an empty string simply invalid rather than an IndexError
        return relative_iri[-1:] in cls.side_char_to_key

    @classmethod
    def is_stoichiometric_tag(cls, tag: str) -> bool:
        """
        Check if a tag has the form of "rh:contains[x]" from which the stoichiometry of [x] can be inferred.
        The bare "rh:contains" tag carries no stoichiometry and is rejected.
        """
        return (tag != cls.contains_prefix) and tag.startswith(cls.contains_prefix)

    @classmethod
    def get_stoichiometry(cls, tag: str) -> str:
        """
        Trim the prefix "rh:contains" from a stoichiometric tag to get the stoichiometry.

        Note that there exist special stoichiometric tags like

        - "containsN",
        - "contains2n" (no idea why it's not "2N"),
        - "containsNplus1", and
        - "containsNminus1"

        Therefore stoichiometry's datatype must be string.
        """
        return cls._strip_prefix(tag, cls.contains_prefix)

    @classmethod
    def produce(cls, relative_iri: str, description: Element):
        """
        Yield one (rhea_id, side_key, comp_num, participation_entry) quadruple per stoichiometric child of
        `description`.

        `relative_iri` is assumed to be valid according to `is_valid_relative_iri`; a KeyError is raised otherwise.
        """
        side_key = cls.side_char_to_key[relative_iri[-1]]

        # Walk the children themselves. Looking each tag up again with description.find(tag) would always return
        # the FIRST child carrying that tag, so two participants sharing a stoichiometry (e.g. two "rh:contains1")
        # would yield the first participant twice and never the second.
        for child in description:
            if not cls.is_stoichiometric_tag(child.tag):
                continue

            contained_absolute_iri = child.attrib[ATTRIB_KEYS["rdf:resource"]]
            # E.g. contained_relative_iri = "Participant_69560_compound_3249_out"
            contained_relative_iri = cls._strip_prefix(contained_absolute_iri, BASE_IRIS["rh"])
            iri_parts = contained_relative_iri.split("_")

            rhea_num = iri_parts[1]
            comp_num = iri_parts[3]
            # a fifth field, when present, is the location for transport reactions
            location = iri_parts[4] if len(iri_parts) == 5 else None

            # Here we assume that the rhea id can be inferred from the contained IRI (because the accession id is
            # not available). E.g. an IRI like "Participant_35975_compound_3512" indicates that
            # compound_lib["3512"] is associated with reaction "RHEA:35975".
            # It's also possible to infer it from the upper-level IRI (the `relative_iri` argument), e.g.
            # "35975_R" in the above example.
            rhea_id = "RHEA:" + rhea_num

            participation_entry = {"stoich": cls.get_stoichiometry(child.tag)}
            if location:
                participation_entry["location"] = location

            yield rhea_id, side_key, comp_num, participation_entry
class CompoundDataFactory:
    """
    Produce basic compound entries from compound descriptions.

    A compound's description is like

        <rdf:Description rdf:about="http://rdf.rhea-db.org/Compound_1454">
            <rh:id rdf:datatype="http://www.w3.org/2001/XMLSchema#long">1454</rh:id>
            <rh:accession>CHEBI:58413</rh:accession>
            <rh:name>(R)-6-hydroxynicotine</rh:name>
            <rh:htmlName>(<i>R</i>)-6-hydroxynicotine</rh:htmlName>
            <rh:formula>C10H15N2O</rh:formula>
            <rh:charge rdf:datatype="http://www.w3.org/2001/XMLSchema#int">1</rh:charge>
            <rdfs:subClassOf rdf:resource="http://rdf.rhea-db.org/SmallMolecule"/>
            <rh:chebi rdf:resource="http://purl.obolibrary.org/obo/CHEBI_58413"/>
            <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/CHEBI_58413"/>
        </rdf:Description>

    Note that only such descriptions with a valid accession ID will be parsed by this data factory. Currently there
    are 3 types of valid accession IDs for compounds, i.e. "CHEBI", "GENERIC", and "POLYMER".

    This data factory produces basic information (without reactive parts, stoichiometry, nor location) for each
    compound.
    """

    compound_prefixes = set(["CHEBI:", "GENERIC:", "POLYMER:"])

    # HTML rendering of the polymerization index "n" that may trail a formula or charge
    _polymer_html_suffix = "<i><sub>n</sub></i>"

    @classmethod
    def _strip_polymer_suffix(cls, text: str) -> str:
        """
        Remove a trailing polymerization index from a formula or charge string.

        The literal "<i><sub>n</sub></i>" suffix is removed when present. Otherwise a single trailing "n" is
        removed only when it directly follows ")".

        `str.rstrip("<i><sub>n</sub></i>")` must not be used for this: it strips any trailing run of the
        characters "<", "i", ">", "s", "u", "b", "n", "/" and therefore turns "Zn" into "Z" or "Cu" into "C".
        """
        if text.endswith(cls._polymer_html_suffix):
            return text[:-len(cls._polymer_html_suffix)]
        # NOTE(review): presumably polymer values look like "(C6H10O5)n" / "(-4)(-1)n" -- confirm against rhea.rdf
        if text.endswith(")n"):
            return text[:-1]
        return text

    @classmethod
    def is_valid_accession_id(cls, accession_id: str) -> bool:
        """Tell whether `accession_id` starts with one of `compound_prefixes`."""
        return accession_id.startswith(tuple(cls.compound_prefixes))

    @classmethod
    def _add_comp_id(cls, comp_entry: dict, accession_id: str):
        """
        Store the accession id under the key matching its type.

        - "CHEBI:<x>" is stored unchanged in comp_entry["chebi_id"],
        - "GENERIC:<x>" is stored as "<x>" in comp_entry["generic_id"],
        - "POLYMER:<x>" is stored as "<x>" in comp_entry["poly_id"].

        Raises ValueError if `accession_id` starts with none of these prefixes.
        """
        if accession_id.startswith("CHEBI:"):
            comp_entry["chebi_id"] = accession_id
        elif accession_id.startswith("GENERIC:"):
            comp_entry["generic_id"] = accession_id[len("GENERIC:"):]
        elif accession_id.startswith("POLYMER:"):
            comp_entry["poly_id"] = accession_id[len("POLYMER:"):]
        else:
            raise ValueError(f"Cannot recognize accession type. Got accession id {accession_id}")

    @classmethod
    def _add_comp_name(cls, comp_entry: dict, description: Element):
        """Copy the text of "rh:name", if that child exists, to comp_entry["name"]."""
        node = description.find("rh:name", NAMESPACES)
        if node is not None:
            comp_entry["name"] = node.text

    @classmethod
    def _add_comp_formula(cls, comp_entry: dict, description: Element):
        """
        Store the text of "rh:formula", minus any polymerization index, in comp_entry["formula"].
        Nothing is stored when the child is missing or has no text.
        """
        node = description.find("rh:formula", NAMESPACES)
        if node is not None and node.text is not None:
            comp_entry["formula"] = cls._strip_polymer_suffix(node.text)

    @classmethod
    def _add_comp_charge(cls, comp_entry: dict, description: Element):
        """
        Store the text of "rh:charge", minus any polymerization index, in comp_entry["charge"].
        Nothing is stored when the child is missing or has no text.
        """
        node = description.find("rh:charge", NAMESPACES)
        if node is not None and node.text is not None:
            # the charge can be a string like '(-4)(-1)' so its datatype cannot be integer
            comp_entry["charge"] = cls._strip_polymer_suffix(node.text)

    @classmethod
    def produce(cls, relative_iri: str, accession_id: str, description: Element):
        """
        Yield a single compound entry built from `description`.

        `relative_iri` is assumed to look like "Compound_<digits>", e.g. "Compound_10594", so the compound number
        is its second "_"-separated field. `accession_id` is assumed to be valid according to
        `is_valid_accession_id`; a ValueError is raised otherwise.
        """
        comp_entry = {"comp_num": relative_iri.split("_")[1]}

        cls._add_comp_id(comp_entry, accession_id)
        cls._add_comp_name(comp_entry, description)
        cls._add_comp_formula(comp_entry, description)
        cls._add_comp_charge(comp_entry, description)

        yield comp_entry

    @classmethod
    def pack(cls, comp_entries: List[dict]):
        """
        Pack a list of compound entries into a dictionary of {comp_num : comp_entry}
        """
        return {comp_entry["comp_num"]: comp_entry for comp_entry in comp_entries}
class ReactionDataFactory:
    """
    Produce reaction entries from master-reaction descriptions.

    A reaction's description is like

        <rdf:Description rdf:about="http://rdf.rhea-db.org/10000">
            <rdfs:subClassOf rdf:resource="http://rdf.rhea-db.org/Reaction"/>
            <rh:id rdf:datatype="http://www.w3.org/2001/XMLSchema#long">10000</rh:id>
            <rh:accession>RHEA:10000</rh:accession>
            <rdfs:label>H2O + pentanamide = NH4(+) + pentanoate</rdfs:label>
            <rh:equation>H2O + pentanamide = NH4(+) + pentanoate</rh:equation>
            <rh:htmlEquation>...</rh:htmlEquation>
            <rh:directionalReaction rdf:resource="http://rdf.rhea-db.org/10001"/>
            <rh:directionalReaction rdf:resource="http://rdf.rhea-db.org/10002"/>
            <rh:bidirectionalReaction rdf:resource="http://rdf.rhea-db.org/10003"/>
            <rh:status rdf:resource="http://rdf.rhea-db.org/Approved"/>
            <rh:isChemicallyBalanced rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">
                true
            </rh:isChemicallyBalanced>
            <rh:isTransport rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">false</rh:isTransport>
            <rdfs:comment>...</rdfs:comment>
            <rh:ec rdf:resource="http://purl.uniprot.org/enzyme/3.5.1.50"/>
            <rdfs:seeAlso rdf:resource="http://identifiers.org/biocyc/METACYC:PENTANAMIDASE-RXN"/>
            <rdfs:seeAlso rdf:resource="http://purl.obolibrary.org/obo/GO_0050168"/>
            <rh:side rdf:resource="http://rdf.rhea-db.org/10000_L"/>
        </rdf:Description>

    Note that only such descriptions with a valid accession ID will be selected to this data factory. Currently there
    is 1 type of valid accession ID for reactions, i.e. those starting with "RHEA".

    Also note that for each reaction, there will be 4 variants, i.e.

    - the master reaction (direction undefined, e.g. "RHEA:10000")
    - 2 directional reactions (left-to-right, e.g. "RHEA:10001", and right-to-left, e.g. "RHEA:10002")
    - the bidirectional reaction (e.g. RHEA:10003)

    In this data factory, only the master reactions will be parsed to individual entries, the directional and
    bidirectional reactions will be attached to their master reactions as "children_rheas".
    """

    @staticmethod
    def _strip_prefix(text: str, prefix: str) -> str:
        """
        Return `text` without the leading `prefix`; `text` is returned unchanged if it does not start with it.

        `str.lstrip` is not suitable here because it removes any leading run of the given *characters*.
        """
        if text.startswith(prefix):
            return text[len(prefix):]
        return text

    @classmethod
    def _resource_suffix(cls, node: Element, base_iri: str) -> str:
        """Return the "rdf:resource" attribute of `node` with the leading `base_iri` removed."""
        return cls._strip_prefix(node.attrib[ATTRIB_KEYS["rdf:resource"]], base_iri)

    @classmethod
    def is_valid_accession_id(cls, accession_id) -> bool:
        """Tell whether `accession_id` starts with "RHEA:"."""
        return accession_id.startswith("RHEA:")

    @classmethod
    def is_master_reaction(cls, description) -> bool:
        """
        There are multiple ways to tell if a reaction is a master reaction.

        Method 1: tell by "rdfs:subClassOf", whose values are "Reaction", "DirectionalReaction",
            "BidirectionalReaction".
        Method 2: tell by the existence of "rh:substrates" and/or "rh:products" (only in "DirectionalReaction"),
            plus "rh:substratesOrProducts" (only in "BidirectionalReaction")

        Here we use method 2: a description is a master reaction when it has neither an "rh:substrates" nor an
        "rh:substratesOrProducts" child.

        Note that a master reaction's RHEA ID is not necessarily a multiple of 4. E.g. "RHEA:26018" for some reason is
        not used, and the next master reaction is "RHEA:26019". Therefore there is no modulo relationship between a
        reaction's type and its RHEA ID.
        """
        has_substrates = description.find("rh:substrates", NAMESPACES) is not None
        has_substrates_or_products = description.find("rh:substratesOrProducts", NAMESPACES) is not None
        return not (has_substrates or has_substrates_or_products)

    @classmethod
    def _add_rhea_equation(cls, reaction_entry: dict, description: Element):
        """Copy the text of "rh:equation", if that child exists, to reaction_entry["equation"]."""
        node = description.find("rh:equation", NAMESPACES)
        if node is not None:
            reaction_entry["equation"] = node.text

    @classmethod
    def _add_rhea_transport(cls, reaction_entry: dict, description: Element):
        """
        Store "rh:isTransport", if that child exists, as a boolean in reaction_entry["is_transport"].

        "rh:isTransport" has specified rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean"; the value is True
        only when the text is "true", and False for any other (or missing) text.
        """
        node = description.find("rh:isTransport", NAMESPACES)
        if node is not None:
            # surrounding whitespace/newlines in the element text must not turn a "true" into False
            reaction_entry["is_transport"] = (node.text or "").strip() == "true"

    # ENZYME is an enzyme nomenclature database, which assigns an EC (Enzyme Commission) number for each enzyme
    @classmethod
    def _add_rhea_ec(cls, reaction_entry: dict, description: Element):
        """
        Store the link and the number of the first "rh:ec" child, if any, in reaction_entry["ec_link"] and
        reaction_entry["ec_id"]. E.g. "http://purl.uniprot.org/enzyme/3.5.1.50" gives ec_id "3.5.1.50".
        """
        node = description.find("rh:ec", NAMESPACES)
        if node is not None:
            ec_link = node.attrib[ATTRIB_KEYS["rdf:resource"]]
            reaction_entry["ec_link"] = ec_link
            reaction_entry["ec_id"] = cls._strip_prefix(ec_link, BASE_IRIS["ec"])

    @classmethod
    def _add_rhea_status(cls, reaction_entry: dict, description: Element):
        """
        Store the status referenced by "rh:status", if that child exists, in reaction_entry["status"].
        3 possible values: Approved, Preliminary, Obsolete.
        """
        node = description.find("rh:status", NAMESPACES)
        if node is not None:
            reaction_entry["status"] = cls._resource_suffix(node, BASE_IRIS["rh"])

    @classmethod
    def _add_rhea_citations(cls, reaction_entry: dict, description: Element):
        """
        Append one "PMID:<number>" per "rh:citation" child to reaction_entry["citations"].
        Entries without citations get no "citations" key.
        """
        for node in description.findall("rh:citation", NAMESPACES):
            citation = "PMID:" + cls._resource_suffix(node, BASE_IRIS["pubmed"])
            reaction_entry.setdefault("citations", []).append(citation)

    @classmethod
    def _add_rhea_children(cls, reaction_entry: dict, description: Element):
        """
        Append the RHEA ids of the child reactions to reaction_entry["children_rheas"]: every
        "rh:directionalReaction" first, then the first "rh:bidirectionalReaction" (there should be 3 in total).
        Entries without such children get no "children_rheas" key.
        """
        child_nodes = description.findall("rh:directionalReaction", NAMESPACES)
        bidirectional_reaction = description.find("rh:bidirectionalReaction", NAMESPACES)
        if bidirectional_reaction is not None:
            child_nodes.append(bidirectional_reaction)

        for node in child_nodes:
            child_rhea_id = "RHEA:" + cls._resource_suffix(node, BASE_IRIS["rh"])
            reaction_entry.setdefault("children_rheas", []).append(child_rhea_id)

    @classmethod
    def produce(cls, accession_id: str, description: Element):
        """
        Yield a single reaction entry for a master-reaction description. `accession_id` becomes the entry's
        "rhea_id"; the sides ("side_l"/"side_r") are not filled in here.
        """
        reaction_entry = {"rhea_id": accession_id}

        cls._add_rhea_equation(reaction_entry, description)
        cls._add_rhea_transport(reaction_entry, description)
        cls._add_rhea_ec(reaction_entry, description)
        cls._add_rhea_status(reaction_entry, description)
        cls._add_rhea_citations(reaction_entry, description)
        cls._add_rhea_children(reaction_entry, description)

        yield reaction_entry

    @classmethod
    def pack(cls, reaction_entries: List[dict]):
        """
        Pack a list of reaction entries into a dictionary of {rhea_id : reaction_entry}
        """
        return {reaction_entry["rhea_id"]: reaction_entry for reaction_entry in reaction_entries}
def load_annotations(data_folder):
    """
    Parse "<data_folder>/rhea.rdf" and yield one document per master reaction.

    Using ElementTree, rhea.rdf is collapsed into a hierarchy of tags with associated information accessed with
    .find() and .findall() functions.

    The main for-loop catches reactive part associations, reaction side associations, compound entries, and reaction
    entries into 4 lists.

    Reactive parts are augmented to their associated compounds in the 2nd for-loop.

    The 3rd for-loop creates side components from compounds and participation fields, and then attaches side
    components to the associated reactions.

    The 4th for-loop yields all reaction documents, with "rhea_id" renamed to "_id".

    :param data_folder: folder that contains "rhea.rdf"
    """
    import os  # function-scope import keeps this block self-contained

    # Giving ElementTree the path (rather than a text-mode file object) lets it open the file in binary mode,
    # honour the encoding declared by the XML itself, and close the file when parsing is done.
    tree = ET.parse(os.path.join(data_folder, "rhea.rdf"))
    root = tree.getroot()

    reactive_part_associations = []
    side_associations = []
    compound_entries = []
    reaction_entries = []

    rh_base = BASE_IRIS["rh"]
    for description in root.findall("rdf:Description", NAMESPACES):
        absolute_iri = description.attrib[ATTRIB_KEYS["rdf:about"]]
        # Remove the base IRI as a prefix; str.lstrip() would strip a character set instead
        if absolute_iri.startswith(rh_base):
            relative_iri = absolute_iri[len(rh_base):]
        else:
            relative_iri = absolute_iri

        accession = description.find("rh:accession", NAMESPACES)
        if accession is None:
            # descriptions without accession: reactive parts and reaction sides, told apart by their IRI
            if ReactivePartDataFactory.is_valid_relative_iri(relative_iri=relative_iri):
                reactive_part_associations.extend(
                    ReactivePartDataFactory.produce(relative_iri=relative_iri, description=description))
            elif SideDataFactory.is_valid_relative_iri(relative_iri=relative_iri):
                side_associations.extend(
                    SideDataFactory.produce(relative_iri=relative_iri, description=description))
            continue

        # descriptions with accession: compounds and (master) reactions, told apart by the accession id
        accession_id = accession.text
        if CompoundDataFactory.is_valid_accession_id(accession_id=accession_id):
            compound_entries.extend(
                CompoundDataFactory.produce(relative_iri=relative_iri,
                                            accession_id=accession_id,
                                            description=description))
        elif ReactionDataFactory.is_valid_accession_id(accession_id=accession_id) and \
                ReactionDataFactory.is_master_reaction(description=description):
            reaction_entries.extend(
                ReactionDataFactory.produce(accession_id=accession_id, description=description))

    compound_lib = CompoundDataFactory.pack(comp_entries=compound_entries)
    reaction_lib = ReactionDataFactory.pack(reaction_entries=reaction_entries)

    for comp_num, rp_entry in reactive_part_associations:
        compound_lib[comp_num].setdefault("reactive_parts", []).append(rp_entry)

    for rhea_id, side_key, comp_num, participation_entry in side_associations:
        side_component = {
            **compound_lib[comp_num],  # the "participant"
            **participation_entry  # describes how this compound participates in the reaction
        }
        # "comp_num" is only an internal join key, it does not belong in the output
        del side_component["comp_num"]

        reaction_lib[rhea_id].setdefault(side_key, []).append(side_component)

    for reaction_entry in reaction_lib.values():
        reaction_entry["_id"] = reaction_entry.pop("rhea_id")
        yield reaction_entry