-
Notifications
You must be signed in to change notification settings - Fork 42
/
sanityCheck.py
134 lines (105 loc) · 5.07 KB
/
sanityCheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""Test functions"""
import pandas as pd
import cobra
from collections import Counter
def get_column_from_tsv(tsv_file, column_id, to_list=True):
"""
read a column from a tsv file and convert the content into a list by default
"""
tsv_content = pd.read_table(tsv_file)
desired_column = tsv_content[column_id]
if to_list:
return desired_column.to_list()
return desired_column
def load_yml(yml_file):
"""
import yml model file, and output rxn and met lists
"""
model = cobra.io.load_yaml_model(yml_file)
# get lists of rxns, mets and genes
modelRxns = list(map(lambda element : element.id, model.reactions))
modelMets = list(map(lambda element : element.id, model.metabolites))
modelGenes = list(map(lambda element : element.id, model.genes))
return modelRxns, modelMets, modelGenes, model
def checkRxnAnnotation(rxns):
"""
check consistency of rxn lists between model and annotation file
"""
rxnList = get_column_from_tsv("model/reactions.tsv", "rxns")
spontaneous = get_column_from_tsv("model/reactions.tsv", "spontaneous", False)
rxnDeprecated = get_column_from_tsv("data/deprecatedIdentifiers/deprecatedReactions.tsv", "rxns")
rxnMisused = set(rxns).intersection(set(rxnDeprecated))
assert len(rxnMisused) == 0, "Deprecated reaction(s) used: {}".format(rxnMisused)
rxnMisused = set(rxns).difference(set(rxnList))
assert len(rxnMisused) == 0, "Reaction(s) not annotated in reactions.tsv: {}".format(rxnMisused)
rxnMisused = set(rxnList).difference(set(rxns))
assert len(rxnMisused) == 0, "Annotated reactions are missing from the model: {}".format(rxnMisused)
assert pd.api.types.is_numeric_dtype(spontaneous), "Spontaneous column should be in numeric!"
def checkMetAnnotation(mets):
"""
check consistency of met lists between model and annotation file
"""
metList = get_column_from_tsv("model/metabolites.tsv", "mets")
metDeprecated = get_column_from_tsv("data/deprecatedIdentifiers/deprecatedMetabolites.tsv", "mets")
metMisused = set(mets).intersection(set(metDeprecated))
assert len(metMisused) == 0, "Deprecated metabolite(s) used: {}".format(metMisused)
metMisused = set(mets).difference(set(metList))
assert len(metMisused) == 0, "Metabolite(s) not annotated in metabolites.tsv: {}".format(metMisused)
metMisused = set(metList).difference(set(mets))
assert len(metMisused) == 0, "Annotated metabolite(s) are missing from the model: {}".format(metMisused)
def checkGeneAnnotation(genes):
"""
check consistency of gene lists between model and annotation file
"""
geneList = get_column_from_tsv("model/genes.tsv", "genes")
geneMisused = set(genes).difference(set(geneList))
assert len(geneMisused) == 0, "Gene(s) not annotated in genes.tsv: {}".format(geneMisused)
geneMisused = set(geneList).difference(set(genes))
assert len(geneMisused) == 0, "Annotated gene(s) are missing from the model: {}".format(geneMisused)
def find_unused_entities(model, entity_type):
"""
collect unused metabolites or genes if exist in the model
"""
# collect all metabolites actually used by reactions
entities_used = []
for reaction in model.reactions:
entities = reaction.genes if entity_type == "genes" else reaction.metabolites
# Loop through each entity and add if not already there
for entity in entities:
if entity not in entities_used:
entities_used.append(entity)
# go through entities in the model and collect unused ones
unused_entities = []
all_entities = model.genes if entity_type == "genes" else model.metabolites
for entity in all_entities:
if entity not in entities_used:
unused_entities.append(entity)
return unused_entities
def checkUnusedEntities(model, entity_type):
"""
check if unused genes or metabolites exist in the model
"""
# collect unused entites
unused_entities = find_unused_entities(model, entity_type)
assert len(unused_entities) == 0, f"Found unused {entity_type}: {unused_entities}"
def checkDupRxn(model):
"""
Check for duplicate reactions in the model
"""
reaction_equations = [rxn.build_reaction_string(use_metabolite_names=False) for rxn in model.reactions]
duplicate_reactions = [reaction for reaction, count in Counter(reaction_equations).items() if count > 1]
dup_list = [model.reactions[idx].id for idx, val in enumerate(reaction_equations) if val in duplicate_reactions]
if duplicate_reactions:
output = f"The following {len(dup_list)} reactions are duplicates, please check: " + ';'.join(dup_list)
print(output)
assert len(duplicate_reactions) == 0, "Found duplicated reactions!"
if __name__ == "__main__":
rxns, mets, genes, model = load_yml("model/Human-GEM.yml")
checkRxnAnnotation(rxns)
checkMetAnnotation(mets)
checkGeneAnnotation(genes)
checkUnusedEntities(model, "metabolites")
checkUnusedEntities(model, "genes")
checkDupRxn(model)
print("All checks have passed.")