#!/bin/env python3
import math
import os
import networkx as nx
import numpy as np
import scipy.stats
import sys
import json
import ast
import re
from typing import Set, Union, Dict, List, Callable
from ARAX_response import ARAXResponse
from query_graph_info import QueryGraphInfo
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../UI/OpenAPI/python-flask-server/")
from openapi_server.models.query_graph import QueryGraph
from openapi_server.models.result import Result
from openapi_server.models.edge import Edge
from openapi_server.models.attribute import Attribute
edge_confidence_manual_agent = 0.999

def _get_nx_edges_by_attr(G: Union[nx.MultiDiGraph, nx.MultiGraph], key: str, val: str) -> Set[tuple]:
    res_set = set()
    for edge_tuple in G.edges(data=True):
        edge_val = edge_tuple[2].get(key, None)
        if edge_val is not None and edge_val == val:
            res_set.add(edge_tuple)
    return res_set

def _get_query_graph_networkx_from_query_graph(query_graph: QueryGraph) -> nx.MultiDiGraph:
    query_graph_nx = nx.MultiDiGraph()
    query_graph_nx.add_nodes_from([key for key, node in query_graph.nodes.items()])
    edge_list = [[edge.subject, edge.object, key, {'weight': 0.0}] for key, edge in query_graph.edges.items()]
    query_graph_nx.add_edges_from(edge_list)
    return query_graph_nx
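
# Illustrative sketch (not part of the ranking pipeline): the converter above only needs
# objects exposing `.nodes`/`.edges` dicts and `.subject`/`.object` on each edge, so a
# hypothetical SimpleNamespace stand-in is enough to show the structure it produces.
def _example_query_graph_to_networkx_demo() -> nx.MultiDiGraph:
    from types import SimpleNamespace
    fake_qg = SimpleNamespace(
        nodes={'n00': SimpleNamespace(), 'n01': SimpleNamespace()},
        edges={'e00': SimpleNamespace(subject='n00', object='n01')})
    qg_nx = _get_query_graph_networkx_from_query_graph(fake_qg)
    # qg_nx now has nodes 'n00' and 'n01' plus one edge keyed 'e00' with weight 0.0
    assert qg_nx['n00']['n01']['e00']['weight'] == 0.0
    return qg_nx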

def _get_weighted_graph_networkx_from_result_graph(kg_edge_id_to_edge: Dict[str, Edge],
                                                   qg_nx: Union[nx.MultiDiGraph, nx.MultiGraph],
                                                   result: Result) -> Union[nx.MultiDiGraph,
                                                                            nx.MultiGraph]:
    res_graph = qg_nx.copy()
    qg_edge_tuples = tuple(qg_nx.edges(keys=True, data=True))
    qg_edge_key_to_edge_tuple = {edge_tuple[2]: edge_tuple for edge_tuple in qg_edge_tuples}
    for analysis in result.analyses:  # For now we only ever have one Analysis per Result
        for key, edge_binding_list in analysis.edge_bindings.items():
            edge_count = 0
            for edge_binding in edge_binding_list:
                kg_edge = kg_edge_id_to_edge[edge_binding.id]
                kg_edge_conf = kg_edge.confidence
                # kg_edge_conf = kg_edge_attributes["confidence"]
                qedge_keys = kg_edge.qedge_keys
                for qedge_key in qedge_keys:
                    qedge_tuple = qg_edge_key_to_edge_tuple[qedge_key]
                    res_graph[qedge_tuple[0]][qedge_tuple[1]][qedge_key]['weight'] += kg_edge_conf
                edge_count += 1
            if edge_count > 0:
                res_graph[qedge_tuple[0]][qedge_tuple[1]][qedge_key]['weight'] /= edge_count
    return res_graph

def _get_weighted_graphs_networkx_from_result_graphs(kg_edge_id_to_edge: Dict[str, Edge],
                                                     qg_nx: Union[nx.MultiDiGraph, nx.MultiGraph],
                                                     results: List[Result]) -> List[Union[nx.MultiDiGraph,
                                                                                          nx.MultiGraph]]:
    res_list = []
    for result in results:
        res_list.append(_get_weighted_graph_networkx_from_result_graph(kg_edge_id_to_edge,
                                                                       qg_nx,
                                                                       result))
    return res_list

# credit: StackOverflow:15590812
def _collapse_nx_multigraph_to_weighted_graph(graph_nx: Union[nx.MultiDiGraph,
                                                              nx.MultiGraph]) -> Union[nx.DiGraph,
                                                                                       nx.Graph]:
    if type(graph_nx) == nx.MultiGraph:
        ret_graph = nx.Graph()
    elif type(graph_nx) == nx.MultiDiGraph:
        ret_graph = nx.DiGraph()
    for u, v, data in graph_nx.edges(data=True):
        w = data['weight'] if 'weight' in data else 1.0
        if ret_graph.has_edge(u, v):
            ret_graph[u][v]['weight'] += w
        else:
            ret_graph.add_edge(u, v, weight=w)
    return ret_graph
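
# Illustrative sketch (not part of the ranking pipeline): parallel edges in a multigraph
# collapse into a single edge whose weight is the sum of the parallel edge weights.
def _example_collapse_multigraph_demo() -> nx.Graph:
    multi = nx.MultiGraph()
    multi.add_edge('a', 'b', weight=0.5)   # first parallel edge
    multi.add_edge('a', 'b', weight=0.25)  # second parallel edge
    collapsed = _collapse_nx_multigraph_to_weighted_graph(multi)
    assert collapsed['a']['b']['weight'] == 0.75  # 0.5 + 0.25
    return collapsed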

# Computes quantile ranks in *ascending* order (so a higher x entry has a higher
# "rank"), where tied entries all receive the same (maximum) rank; scipy.stats
# is used here specifically in order to handle ties correctly.
def _quantile_rank_list(x: List[float]) -> np.array:
    y = scipy.stats.rankdata(x, method='max')
    return y / len(y)
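
# Illustrative sketch (not part of the ranking pipeline): with method='max', tied values
# share the highest rank occupied by the tie, so [0.2, 0.5, 0.5, 0.9] ranks to
# [1, 3, 3, 4] and the returned quantile ranks are [0.25, 0.75, 0.75, 1.0].
def _example_quantile_rank_demo() -> np.array:
    quantile_ranks = _quantile_rank_list([0.2, 0.5, 0.5, 0.9])
    assert list(quantile_ranks) == [0.25, 0.75, 0.75, 1.0]
    return quantile_ranks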

def _score_networkx_graphs_by_max_flow(result_graphs_nx: List[Union[nx.MultiDiGraph,
                                                                    nx.MultiGraph]]) -> List[float]:
    max_flow_values = []
    for result_graph_nx in result_graphs_nx:
        if len(result_graph_nx) > 1:
            apsp_dict = dict(nx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path_length(result_graph_nx))
            path_len_with_pairs_list = [(node_i, node_j, path_len) for node_i, node_i_dict in apsp_dict.items() for node_j, path_len in node_i_dict.items()]
            max_path_len = max([path_len_with_pair_list_item[2] for path_len_with_pair_list_item in
                                path_len_with_pairs_list])
            pairs_with_max_path_len = [path_len_with_pair_list_item[0:2] for path_len_with_pair_list_item in path_len_with_pairs_list if
                                       path_len_with_pair_list_item[2] == max_path_len]
            max_flow_values_for_node_pairs = []
            result_graph_collapsed_nx = _collapse_nx_multigraph_to_weighted_graph(result_graph_nx)
            for source_node_id, target_node_id in pairs_with_max_path_len:
                max_flow_values_for_node_pairs.append(nx.algorithms.flow.maximum_flow_value(result_graph_collapsed_nx,
                                                                                            source_node_id,
                                                                                            target_node_id,
                                                                                            capacity="weight"))
            max_flow_value = 0.0
            if len(max_flow_values_for_node_pairs) > 0:
                max_flow_value = sum(max_flow_values_for_node_pairs) / float(len(max_flow_values_for_node_pairs))
        else:
            max_flow_value = 1.0
        max_flow_values.append(max_flow_value)
    return max_flow_values
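
# Illustrative sketch (not part of the ranking pipeline): for a simple directed path
# a -> b -> c the most-distant pair is (a, c), and the max flow between them equals the
# bottleneck weight on the path, so the score below should be min(0.9, 0.3) = 0.3.
def _example_max_flow_score_demo() -> List[float]:
    path_graph = nx.MultiDiGraph()
    path_graph.add_edge('a', 'b', 'e00', weight=0.9)
    path_graph.add_edge('b', 'c', 'e01', weight=0.3)
    scores = _score_networkx_graphs_by_max_flow([path_graph])
    assert abs(scores[0] - 0.3) < 1e-9
    return scores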

def _score_networkx_graphs_by_longest_path(result_graphs_nx: List[Union[nx.MultiDiGraph,
                                                                        nx.MultiGraph]]) -> List[float]:
    result_scores = []
    for result_graph_nx in result_graphs_nx:
        apsp_dict = dict(nx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path_length(result_graph_nx))
        path_len_with_pairs_list = [(node_i, node_j, path_len) for node_i, node_i_dict in apsp_dict.items() for node_j, path_len in node_i_dict.items()]
        max_path_len = max([path_len_with_pair_list_item[2] for path_len_with_pair_list_item in
                            path_len_with_pairs_list])
        pairs_with_max_path_len = [path_len_with_pair_list_item[0:2] for path_len_with_pair_list_item in path_len_with_pairs_list if
                                   path_len_with_pair_list_item[2] == max_path_len]
        map_node_name_to_index = {node_id: node_index for node_index, node_id in enumerate(result_graph_nx.nodes)}
        adj_matrix = nx.to_numpy_matrix(result_graph_nx)
        adj_matrix_power = np.linalg.matrix_power(adj_matrix, max_path_len) / math.factorial(max_path_len)
        score_list = [adj_matrix_power[map_node_name_to_index[node_i],
                                       map_node_name_to_index[node_j]] for node_i, node_j in pairs_with_max_path_len]
        result_score = np.mean(score_list)
        result_scores.append(result_score)
    return result_scores

def _score_networkx_graphs_by_frobenius_norm(result_graphs_nx: List[Union[nx.MultiDiGraph,
                                                                          nx.MultiGraph]]) -> List[float]:
    result_scores = []
    for result_graph_nx in result_graphs_nx:
        adj_matrix = nx.to_numpy_matrix(result_graph_nx)
        result_score = np.linalg.norm(adj_matrix, ord='fro')
        result_scores.append(result_score)
    return result_scores
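
# Illustrative sketch (not part of the ranking pipeline): the Frobenius norm of a weighted
# adjacency matrix is the square root of the sum of squared entries, so heavier and more
# numerous edges yield a larger score. Computed here directly with numpy for clarity.
def _example_frobenius_norm_demo() -> float:
    adjacency = np.array([[0.0, 0.6],
                          [0.6, 0.0]])  # one undirected edge of weight 0.6
    frobenius = np.linalg.norm(adjacency, ord='fro')
    assert abs(frobenius - math.sqrt(2 * 0.6 ** 2)) < 1e-12
    return frobenius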

def _score_result_graphs_by_networkx_graph_scorer(kg_edge_id_to_edge: Dict[str, Edge],
                                                  qg_nx: Union[nx.MultiDiGraph, nx.MultiGraph],
                                                  results: List[Result],
                                                  nx_graph_scorer: Callable[[List[Union[nx.MultiDiGraph,
                                                                                        nx.MultiGraph]]], np.array]) -> List[float]:
    result_graphs_nx = _get_weighted_graphs_networkx_from_result_graphs(kg_edge_id_to_edge,
                                                                        qg_nx,
                                                                        results)
    return nx_graph_scorer(result_graphs_nx)

def _break_ties_and_preserve_order(scores):
    adjusted_scores = scores.copy()
    n = len(scores)
    # if there are more than 1,000 scores, apply the fix to the first 1,000 scores and ignore the rest
    if n > 1000:
        n = 1000
        # set all scores below the 1,000th to 0
        for i in range(n, len(adjusted_scores)):
            adjusted_scores[i] = 0
    # round all scores to 3 decimal places initially to make adjustment easier
    adjusted_scores = [round(score, 3) for score in adjusted_scores]
    # Adjust scores in descending order to ensure no tie or inversion
    for i in range(1, n):
        if adjusted_scores[i] >= adjusted_scores[i - 1]:
            # Decrease the current score to make it strictly less than the previous score
            new_score = adjusted_scores[i - 1] - 0.001
            adjusted_scores[i] = max(new_score, 0)  # Prevent going below 0
    # Final check to ensure all scores are within bounds
    adjusted_scores = [round(max(min(score, 1), 0), 3) for score in adjusted_scores]
    return adjusted_scores
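
# Illustrative sketch (not part of the ranking pipeline): tied scores are nudged down by
# 0.001 so the sorted order is preserved but every kept score is strictly decreasing.
def _example_break_ties_demo() -> List[float]:
    adjusted = _break_ties_and_preserve_order([0.9, 0.9, 0.85])
    assert adjusted == [0.9, 0.899, 0.85]
    return adjusted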

class ARAXRanker:

    # #### Constructor
    def __init__(self):
        self.response = None
        self.message = None
        self.parameters = None
        # edge attributes we know about
        self.known_attributes = {'probability', 'normalized_google_distance', 'jaccard_index',
                                 'probability_treats', 'paired_concept_frequency',
                                 'observed_expected_ratio', 'chi_square', 'chi_square_pvalue', 'MAGMA-pvalue', 'Genetics-quantile',
                                 'pValue', 'fisher_exact_test_p-value', 'Richards-effector-genes',
                                 'feature_coefficient', 'CMAP similarity score'}
        # how much we trust each of the edge attributes
        self.known_attributes_to_trust = {'probability': 0.5,
                                          'normalized_google_distance': 0.8,
                                          'jaccard_index': 0.5,
                                          'probability_treats': 1,
                                          'paired_concept_frequency': 0.5,
                                          'observed_expected_ratio': 0.8,
                                          'chi_square': 0.8,
                                          'chi_square_pvalue': 0.8,
                                          'MAGMA-pvalue': 1.0,
                                          'Genetics-quantile': 1.0,
                                          'pValue': 1.0,
                                          'fisher_exact_test_p-value': 0.8,
                                          'Richards-effector-genes': 0.5,
                                          'feature_coefficient': 1.0,
                                          'CMAP similarity score': 1.0
                                          }
        self.virtual_edge_types = {}
        self.score_stats = dict()  # dictionary that stores the maxes and mins of the edge attribute values
        self.kg_edge_id_to_edge = dict()  # map between the edge ids in the results and the actual edges themselves

    def describe_me(self):
        """
        Little helper function for internal use that describes the actions and what they can do
        :return:
        """
        brief_description = """
            rank_results iterates through all edges in the results list, aggregating and
            normalizing the scores stored within the edge_attributes property. After combining these scores into
            one score, the ranker then scores each result through a combination of max flow, longest path,
            and Frobenius norm.
            """
        description = """
            `rank_results` iterates through all edges in the results list, aggregating and
            normalizing the scores stored within the `edge_attributes` property. After combining these scores into
            one score, the ranker then scores each result through a combination of
            [max flow](https://en.wikipedia.org/wiki/Maximum_flow_problem),
            [longest path](https://en.wikipedia.org/wiki/Longest_path_problem),
            and [Frobenius norm](https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm).
            """
        description_list = []
        params_dict = dict()
        params_dict['brief_description'] = brief_description
        params_dict['description'] = description
        params_dict["dsl_command"] = "rank_results()"
        description_list.append(params_dict)
        return description_list

    def result_confidence_maker(self, result):
        ###############################
        # old method of just multiplying ALL the edge confidences together
        if True:
            result_confidence = 1  # everybody gets to start with a confidence of 1
            for edge in result.edge_bindings:
                kg_edge_id = edge.id
                # TODO: replace this with the more intelligent function
                # here we are just multiplying the edge confidences
                # --- to see what info is going into each result: print(f"{result.essence}: {kg_edges[kg_edge_id].type}, {kg_edges[kg_edge_id].confidence}")
                result_confidence *= self.kg_edge_id_to_edge[kg_edge_id].confidence
                # kg_edge_attributes = {x.original_attribute_name: x.value for x in self.kg_edge_id_to_edge[kg_edge_id].attributes}
                # result_confidence *= kg_edge_attributes["confidence"]
            result.confidence = result_confidence
        else:
            # consider each pair of nodes in the QG, then somehow combine that information
            # Idea:
            #   in each result
            #   for each source and target node:
            #       combine the confidences into a single edge with a single confidence that takes everything
            #       into account (edges, edge scores, edge types, etc.)
            #   then assign result confidence as average/median of these "single" edge confidences?
            result.confidence = 1

    def edge_attribute_score_combiner(self, edge):
        """
        This function takes a single edge and decides how to combine its attribute scores into a single confidence.
        Eventually we will want to:
        1. Weight different attributes by different amounts
        2. Figure out what to do with edges that have no attributes
        """
        # Currently a dead simple "just multiply them all together"
        edge_confidence = 1
        edge_attribute_dict = {}
        publications = []
        if edge.attributes is not None:
            for edge_attribute in edge.attributes:
                if edge_attribute.original_attribute_name is not None:
                    edge_attribute_dict[edge_attribute.original_attribute_name] = edge_attribute.value
                    normalized_score = self.edge_attribute_score_normalizer(edge_attribute.original_attribute_name, edge_attribute.value)
                else:
                    edge_attribute_dict[edge_attribute.attribute_type_id] = edge_attribute.value
                    normalized_score = self.edge_attribute_score_normalizer(edge_attribute.attribute_type_id, edge_attribute.value)
                if edge_attribute.attribute_type_id == "biolink:publications":
                    if isinstance(edge_attribute.value, str):
                        publications = publications + [edge_attribute.value]
                    elif isinstance(edge_attribute.value, list):
                        publications = publications + edge_attribute.value
                if normalized_score == -1:  # this means we have no current normalization of this kind of attribute,
                    continue  # so don't do anything to the score since we don't know what to do with it yet
                else:  # we have a way to normalize it, so multiply away
                    edge_confidence *= normalized_score
            # Factor in the number of distinct publications supporting this edge
            # (see the illustrative publication-factor sketch after this class)
            n_publications = len(set(publications))
            if n_publications == 0:
                pub_value = 0.01
            else:
                pub_value = np.log(n_publications)
            max_value = 1.0
            curve_steepness = 3.16993
            logistic_midpoint = 1.38629
            pub_value = max_value / float(1 + np.exp(-curve_steepness * (pub_value - logistic_midpoint)))
            edge_confidence *= pub_value
        return edge_confidence

    def edge_attribute_score_normalizer(self, edge_attribute_name: str, edge_attribute_value) -> float:
        """
        Takes an input edge attribute and value, and dispatches it to the appropriate method that translates the value into
        something in the interval [0,1] where 0 is worse and 1 is better.
        (See the illustrative dispatch sketch after this class.)
        """
        if edge_attribute_name not in self.known_attributes:
            return -1  # TODO: might want to change this
        else:
            if edge_attribute_value == "no value!":
                edge_attribute_value = 0
            try:
                # check to see if it's convertible to a float (will catch None's as well)
                edge_attribute_value = float(edge_attribute_value)
            except TypeError:
                return 0.
            except ValueError:
                return 0.
            # check to see if it's NaN; if so, return 0
            if np.isnan(edge_attribute_value):
                return 0.
            # else it's all good to proceed
            else:
                # Replace hyphens, spaces, and colons with underscores in the attribute name
                edge_attribute_name = re.sub(r'[- \:]', '_', edge_attribute_name)
                # then dispatch to the appropriate function that does the score normalizing to get it to be in [0, 1] with 1 better
                return getattr(self, '_' + self.__class__.__name__ + '__normalize_' + edge_attribute_name)(value=edge_attribute_value)

    def __normalize_probability_treats(self, value):
        """
        Normalize the probability that a drug treats a disease.
        Empirically we've found that values greater than ~0.75 are "good" and <~0.75 are "bad" predictions of "treats",
        so we throw this into a logistic function so that higher scores remain high and low scores drop off
        pretty quickly.
        To see this curve in Mathematica:
            L = 1; (*max value returned*)
            k = 15; (*steepness of the logistic curve*)
            x0 = 0.60; (*mid point of the logistic curve*)
            Plot[L/(1 + Exp[-k (x - x0)]), {x, 0, 1}, PlotRange -> All, AxesOrigin -> {0, 0}]
        or in Python:
            import matplotlib.pyplot as plt
            max_value = 1
            curve_steepness = 15
            logistic_midpoint = 0.60
            x = np.linspace(0, 1, 200)
            y = [max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint))) for value in x]
            plt.plot(x, y)
            plt.show()
        """
        max_value = 1
        curve_steepness = 15
        logistic_midpoint = 0.60
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        # TODO: if "near" to the min value, set to zero (maybe one std dev from the min value of the logistic curve?)
        # TODO: make sure max value can be obtained
        return normalized_value

    def __normalize_normalized_google_distance(self, value):
        """
        Normalize the "normalized_google_distance" attribute value.
        """
        max_value = 1
        curve_steepness = -9
        logistic_midpoint = 0.60
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        # TODO: if "near" to the min value, set to zero (maybe one std dev from the min value of the logistic curve?)
        # TODO: make sure max value can be obtained
        return normalized_value

    def __normalize_probability(self, value):
        """
        These are (as of 7/28/2020, in KG1 and KG2) only "drug->protein binding probabilities".
        As Vlado suggested, the lower ones are more rubbish, so again throw into a logistic function, but an even steeper one.
        See __normalize_probability_treats for how to visualize this.
        """
        max_value = 1
        curve_steepness = 20
        logistic_midpoint = 0.8
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        # TODO: if "near" to the min value, set to zero (maybe one std dev from the min value of the logistic curve?)
        # TODO: make sure max value can be obtained
        return normalized_value

    def __normalize_jaccard_index(self, value):
        """
        The jaccard index is all relative to other results, so there is no reason to use a logistic here.
        Just compare the value to the maximum value.
        """
        normalized_value = value / self.score_stats['jaccard_index']['maximum']
        # print(f"value: {value}, normalized: {normalized_value}")
        return normalized_value

    def __normalize_paired_concept_frequency(self, value):
        """
        Again, these are _somewhat_ relative values. In actuality, a logistic here would make sense,
        but I don't know the distribution of frequencies in COHD, so just go the relative route.
        """
        # check to make sure we don't divide by zero
        # try:
        #     normalized_value = value / score_stats['paired_concept_frequency']['maximum']
        # except ZeroDivisionError:
        #     normalized_value = value / (score_stats['paired_concept_frequency']['maximum'] + np.finfo(float).eps)

        # Give logistic a try
        # TODO: see if we can adjust these params based on the score stats (or see if that's even a good idea)
        max_value = 1
        curve_steepness = 2000  # really steep since the max values I've ever seen are quite small (e.g. .03)
        logistic_midpoint = 0.002  # seems like an ok midpoint, but....
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        # TODO: if "near" to the min value, set to zero (maybe one std dev from the min value of the logistic curve?)
        # TODO: make sure max value can be obtained
        # print(f"value: {value}, normalized: {normalized_value}")
        return normalized_value

    def __normalize_observed_expected_ratio(self, value):
        """
        These are log ratios, so they should be interpreted as Exp[value] times more likely than chance.
        """
        max_value = 1
        curve_steepness = 2  # TODO: need to fiddle with this as it's not quite weighting things enough
        logistic_midpoint = 2  # Exp[2] more likely than chance
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        # TODO: if "near" to the min value, set to zero (maybe one std dev from the min value of the logistic curve?)
        # TODO: make sure max value can be obtained
        # print(f"value: {value}, normalized: {normalized_value}")
        return normalized_value

    def __normalize_chi_square(self, value):
        """
        From COHD: Note that due to large sample sizes, the chi-square can become very large.
        Hence the p-values will be very, very small... Hard to use a logistic function directly, so instead take the
        -log(p_value) approach and use that (taking a page from the geneticist's handbook).
        """
        # Taking the value as is:
        # max_value = 1
        # curve_steepness = -100
        # logistic_midpoint = 0.05
        # normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))

        # -Log[p_value] approach
        value = -np.log(value)
        max_value = 1
        curve_steepness = 0.03
        logistic_midpoint = 200
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        # TODO: if "near" to the min value, set to zero (maybe one std dev from the min value of the logistic curve?)
        # TODO: make sure max value can be obtained
        # print(f"value: {value}, normalized: {normalized_value}")
        return normalized_value

    def __normalize_chi_square_pvalue(self, value):
        return self.__normalize_chi_square(value)

    def __normalize_MAGMA_pvalue(self, value):
        """
        For Genetics Provider MAGMA p-value: Convert the provided p-value to a number between 0 and 1,
        with 1 being best. Estimated conversion from SAR and DMK 2020-09-22.
        """
        value = -np.log(value)
        max_value = 1.0
        curve_steepness = 0.849
        logistic_midpoint = 4.97
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        return normalized_value

    def __normalize_pValue(self, value):
        """
        For Genetics Provider MAGMA p-value: Convert the provided p-value to a number between 0 and 1,
        with 1 being best. Estimated conversion from SAR and DMK 2020-09-22.
        """
        value = -np.log(value)
        max_value = 1.0
        curve_steepness = 0.849
        logistic_midpoint = 4.97
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        return normalized_value

    def __normalize_Genetics_quantile(self, value):
        """
        For Genetics Provider MAGMA quantile: We decided 2020-09-22 that just using
        the quantile as-is is best. With DMK, SAR, EWD.
        """
        return value

    def __normalize_fisher_exact_test_p_value(self, value):
        """
        For FET p-values: Including two options.
        The first option is to simply use 1-(p-value).
        The second is a custom logarithmic transform followed by a logistic; 0.05 should correspond to ~0.95 after the logistic is applied.
        """
        # option 1:
        # normalized_value = 1 - value
        # option 2:
        try:
            if value <= np.finfo(float).eps:
                normalized_value = 1.
            else:
                value = -np.log(value)
                max_value = 1.0
                curve_steepness = 3
                logistic_midpoint = 2.7
                normalized_value = max_value / float(1 + np.exp(-curve_steepness * (value - logistic_midpoint)))
        except RuntimeWarning:  # this is the case when value is 0 (or nearly so), so we should award the max value
            normalized_value = 1.
        return normalized_value

    def __normalize_CMAP_similarity_score(self, value):
        normalized_value = abs(value / 100)
        return normalized_value

    def __normalize_Richards_effector_genes(self, value):
        return value

    def __normalize_feature_coefficient(self, value):
        log_abs_value = np.log(abs(value))
        max_value = 1
        curve_steepness = 2.75
        logistic_midpoint = 0.15
        normalized_value = max_value / float(1 + np.exp(-curve_steepness * (log_abs_value - logistic_midpoint)))
        return normalized_value

    def aggregate_scores_dmk(self, response):
        """
        Take in a message,
        decorate all edges with confidences,
        take each result and use edge confidences and other info to populate result confidences,
        and populate the result.row_data and message.table_column_names.
        Does everything in place (no result returned).
        """
        self.response = response
        response.debug(f"Starting to rank results")
        message = response.envelope.message
        self.message = message

        # #### Compute some basic information about the query_graph
        query_graph_info = QueryGraphInfo()
        result = query_graph_info.assess(message)
        # response.merge(result)
        # if result.status != 'OK':
        #     print(response.show(level=ARAXResponse.DEBUG))
        #     return response

        # DMK FIXME: This needs to be refactored so that:
        #   1. The attribute names are dynamically mapped to functions that handle their weightings (for ease of renaming attribute names)
        #   2. Weighting of individual attributes (e.g. "probability" should be trusted MUCH less than "probability_treats")
        #   3. Auto-handling of normalizing scores to be in [0,1] (e.g. observed_expected ratio is in (-inf, inf) while probability is in (0,1))
        #   4. Auto-thresholding of values (e.g. if chi_square < 0.05, penalize the most; if probability_treats < 0.8, penalize the most, etc.)
        #   5. Allow for ranked answers (e.g. observed_expected can have a single, huge value, skewing the rest of them)

        # #### Iterate through all the edges in the knowledge graph to:
        # #### 1) Create a dict of all edges by id
        # #### 2) Collect some min,max stats for edge_attributes that we may need later
        kg_edge_id_to_edge = self.kg_edge_id_to_edge
        score_stats = self.score_stats
        no_non_inf_float_flag = True
        for edge_key, edge in message.knowledge_graph.edges.items():
            kg_edge_id_to_edge[edge_key] = edge
            if edge.attributes is not None:
                for edge_attribute in edge.attributes:
                    for attribute_name in self.known_attributes:
                        if edge_attribute.original_attribute_name == attribute_name or edge_attribute.attribute_type_id == attribute_name:
                            if edge_attribute.value == "no value!":
                                edge_attribute.value = 0
                                value = 0
                            else:
                                try:
                                    value = float(edge_attribute.value)
                                except ValueError:
                                    continue
                                except TypeError:
                                    continue
                            # initialize the stats for this attribute if we haven't seen it yet
                            if attribute_name not in score_stats:
                                score_stats[attribute_name] = {'minimum': None, 'maximum': None}  # FIXME: doesn't handle the case when all values are inf|NaN
                            if not np.isinf(value) and not np.isinf(-value) and not np.isnan(value):  # Ignore inf, -inf, and nan
                                no_non_inf_float_flag = False
                                if not score_stats[attribute_name]['minimum']:
                                    score_stats[attribute_name]['minimum'] = value
                                if not score_stats[attribute_name]['maximum']:
                                    score_stats[attribute_name]['maximum'] = value
                                if value > score_stats[attribute_name]['maximum']:
                                    score_stats[attribute_name]['maximum'] = value
                                if value < score_stats[attribute_name]['minimum']:
                                    score_stats[attribute_name]['minimum'] = value
        if no_non_inf_float_flag:
            response.warning(
                f"No non-infinite value was encountered in any edge attribute in the knowledge graph.")
        response.info(f"Summary of available edge metrics: {score_stats}")

        edge_ids_manual_agent = set()
        # Loop over the entire KG and normalize and combine the score of each edge, place that information in the confidence attribute of the edge
        for edge_key, edge in message.knowledge_graph.edges.items():
            if edge.attributes is not None:
                edge_attributes = {x.original_attribute_name: x.value for x in edge.attributes}
                for edge_attribute in edge.attributes:
                    if edge_attribute.attribute_type_id == "biolink:agent_type" and edge_attribute.value == "manual_agent":
                        edge_attributes['confidence'] = edge_confidence_manual_agent
                        edge.confidence = edge_confidence_manual_agent
                        edge_ids_manual_agent.add(edge_key)
                        break
            else:
                edge_attributes = {}
            if edge_attributes.get("confidence", None) is not None:
                # if False:  # FIXME: there is no longer such an attribute. Stored as a generic attribute?
                # if edge.confidence is not None:
                # don't touch the confidence, since apparently someone already knows what the confidence should be
                edge.confidence = edge_attributes['confidence']
                # continue
            else:
                confidence = self.edge_attribute_score_combiner(edge)
                # edge.attributes.append(Attribute(name="confidence", value=confidence))
                edge.confidence = confidence

        # Now that each edge has a confidence attached to it based on its attributes, we can now:
        # 1. consider edge types of the results
        # 2. number of edges in the results
        # 3. possibly conflicting information, etc.
        results = message.results
        edge_set_to_high_confidence = set()
        for result in results:
            edge_bindings = result.analyses[0].edge_bindings
            for qedge_key in message.query_graph.edges.keys():
                all_edges_for_qedge_are_high_confidence = False
                bound_edges = edge_bindings.get(qedge_key, [])
                for edge_name in bound_edges:
                    edge_id = edge_name.id
                    if edge_id in edge_ids_manual_agent:
                        all_edges_for_qedge_are_high_confidence = True
                        break
                if all_edges_for_qedge_are_high_confidence:
                    for edge_name in bound_edges:
                        edge_set_to_high_confidence.add(edge_name.id)
        for edge_key, edge in message.knowledge_graph.edges.items():
            if edge_key in edge_set_to_high_confidence:
                print(f"setting max confidence for edge_key: {edge_key}")
                edge.confidence = edge_confidence_manual_agent

        ###################################
        # TODO: Replace this with a more "intelligent" separate function
        # now we can loop over all the results, and combine their edge confidences (now populated)
        qg_nx = _get_query_graph_networkx_from_query_graph(message.query_graph)
        kg_edge_id_to_edge = self.kg_edge_id_to_edge
        ranks_list = list(map(_quantile_rank_list,
                              map(lambda scorer_func: _score_result_graphs_by_networkx_graph_scorer(kg_edge_id_to_edge,
                                                                                                    qg_nx,
                                                                                                    results,
                                                                                                    scorer_func),
                                  [_score_networkx_graphs_by_max_flow,
                                   _score_networkx_graphs_by_longest_path,
                                   _score_networkx_graphs_by_frobenius_norm])))
        result_scores = sum(ranks_list) / float(len(ranks_list))
        # print(result_scores)

        # Replace inferred results' scores with the probability score calculated by the xDTD model
        inferred_qedge_keys = [qedge_key for qedge_key, qedge in message.query_graph.edges.items()
                               if qedge.knowledge_type == "inferred"]
        for result, score in zip(results, result_scores):
            result.analyses[0].score = score  # For now we only ever have one Analysis per Result
            if inferred_qedge_keys:
                inferred_qedge_key = inferred_qedge_keys[0]
                edge_bindings = result.analyses[0].edge_bindings
                inferred_edge_bindings = []
                if edge_bindings:
                    inferred_edge_bindings = edge_bindings.get(inferred_qedge_key, [])
                for edge_name in inferred_edge_bindings:
                    edge_id = edge_name.id
                    edge_attributes = message.knowledge_graph.edges[edge_id].attributes
                    if edge_attributes is not None:
                        for edge_attribute in edge_attributes:
                            if edge_attribute.original_attribute_name == 'probability_treats' and edge_attribute.value is not None:
                                result.analyses[0].score = float(edge_attribute.value)

            # for result in message.results:
            #     self.result_confidence_maker(result)
            ###################################

            # Make all scores at least 0.001. This is all way low anyway, but let's not have anything that rounds to zero
            # This is a little bad in that 0.0005 becomes better than 0.0011, but this is all way low, so who cares
            if result.analyses[0].score < 0.001:
                result.analyses[0].score += 0.001
            # Round to reasonable precision. Keep only 3 digits after the decimal
            score = int(result.analyses[0].score * 1000 + 0.5) / 1000.0
            result.row_data = [score, result.essence, result.essence_category]

        # Add table column names
        response.envelope.table_column_names = ['score', 'essence', 'essence_category']

        # Re-sort the final results
        message.results.sort(key=lambda result: result.analyses[0].score, reverse=True)

        # break ties and preserve order, round to 3 digits, and make sure none are < 0
        scores_with_ties = [result.analyses[0].score for result in message.results]
        scores_without_ties = _break_ties_and_preserve_order(scores_with_ties)
        print(scores_with_ties)
        print(scores_without_ties)
        # reinsert these scores into the results
        for result, score in zip(message.results, scores_without_ties):
            result.analyses[0].score = score
            result.row_data[0] = score

        response.debug("Results have been ranked and sorted")
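
# Illustrative sketch (not part of the ranking pipeline) of the publication-count factor used
# in ARAXRanker.edge_attribute_score_combiner above: the distinct publication count is
# log-transformed and passed through a logistic whose midpoint 1.38629 is ln(4), so roughly
# 4 publications give a 0.5 multiplier and the factor saturates toward 1.0 as the count grows.
def _example_publication_factor_demo(n_publications: int) -> float:
    if n_publications == 0:
        pub_value = 0.01
    else:
        pub_value = np.log(n_publications)
    max_value = 1.0
    curve_steepness = 3.16993
    logistic_midpoint = 1.38629  # ln(4)
    return max_value / float(1 + np.exp(-curve_steepness * (pub_value - logistic_midpoint)))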
##########################################################################################
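
# Illustrative sketch (not part of the ranking pipeline) of how ARAXRanker.edge_attribute_score_normalizer
# dispatches by attribute name: hyphens, spaces, and colons become underscores and the result is looked up
# as a name-mangled private method, e.g. 'probability_treats' -> _ARAXRanker__normalize_probability_treats.
def _example_normalizer_dispatch_demo() -> float:
    ranker = ARAXRanker()
    normalized = ranker.edge_attribute_score_normalizer('probability_treats', 0.8)
    # logistic with steepness 15 and midpoint 0.60, so 0.8 maps to roughly 0.95
    assert 0.9 < normalized < 1.0
    return normalized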

def main():
    # For faster testing, cache the testing messages locally
    import requests_cache
    requests_cache.install_cache('ARAX_ranker_testing_cache')

    import argparse
    argparser = argparse.ArgumentParser(description='Ranker system')
    argparser.add_argument('--local', action='store_true', help='If set, use local RTXFeedback database to fetch messages')
    params = argparser.parse_args()

    # --- Create a response object
    response = ARAXResponse()
    ranker = ARAXRanker()

    # --- Get a Message to work on
    from ARAX_messenger import ARAXMessenger
    messenger = ARAXMessenger()
    if not params.local:
        print("INFO: Fetching message to work on from arax.ncats.io", flush=True)
        message = messenger.fetch_message('https://arax.ncats.io/api/rtx/v1/message/2614')  # acetaminophen -> protein, just NGD as virtual edge
        # message = messenger.fetch_message('https://arax.ncats.io/api/rtx/v1/message/2687')  # neutropenia -> drug, predict_drug_treats_disease and ngd
        # message = messenger.fetch_message('https://arax.ncats.io/api/rtx/v1/message/2701')  # observed_expected_ratio and ngd
        # message = messenger.fetch_message('https://arax.ncats.io/api/rtx/v1/message/2703')  # a huge one with jaccard
        # message = messenger.fetch_message('https://arax.ncats.io/api/rtx/v1/message/2706')  # small one with paired concept frequency
        # message = messenger.fetch_message('https://arax.ncats.io/api/rtx/v1/message/2709')  # bigger one with paired concept frequency

    # For local messages due to local changes in code not rolled out to production:
    if params.local:
        sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../UI/Feedback")
        from RTXFeedback import RTXFeedback
        araxdb = RTXFeedback()
        message_dict = araxdb.getMessage(294)  # local version of 2709 but with updates to COHD
        # message_dict = araxdb.getMessage(297)
        # message_dict = araxdb.getMessage(298)
        # message_dict = araxdb.getMessage(299)  # observed_expected_ratio, different disease
        # message_dict = araxdb.getMessage(300)  # chi_square
        # message_dict = araxdb.getMessage(302)  # chi_square, different disease
        # message_dict = araxdb.getMessage(304)  # all clinical info, osteoarthritis
        # message_dict = araxdb.getMessage(305)  # all clinical info, neutropenia
        # message_dict = araxdb.getMessage(306)  # all clinical info, neutropenia, but with virtual edges
        # message_dict = araxdb.getMessage(307)  # all clinical info, osteoarthritis, but with virtual edges
        # message_dict = araxdb.getMessage(322)  # Parkinson's, Jaccard, top 50
        # message_dict = araxdb.getMessage(324)  # chi_square, KG2
        # message_dict = araxdb.getMessage(325)  # chi_square, ngd, KG2
        # message_dict = araxdb.getMessage(326)  # prob drug treats disease as attribute to all edges, thrombocytopenia
        # message_dict = araxdb.getMessage(327)
        # add_qnode(name=DOID:1227, id=n00)
        # add_qnode(type=protein, is_set=true, id=n01)
        # add_qnode(type=chemical_substance, id=n02)
        # add_qedge(subject=n00, object=n01, id=e00)
        # add_qedge(subject=n01, object=n02, id=e01, type=physically_interacts_with)
        # expand(edge_id=[e00,e01], kp=ARAX/KG1)
        # overlay(action=compute_jaccard, start_node_id=n00, intermediate_node_id=n01, end_node_id=n02, virtual_relation_label=J1)
        # overlay(action=predict_drug_treats_disease, source_qnode_id=n02, target_qnode_id=n00, virtual_relation_label=P1)
        # overlay(action=overlay_clinical_info, chi_square=true, virtual_relation_label=C1, source_qnode_id=n00, target_qnode_id=n02)
        # overlay(action=compute_ngd, virtual_relation_label=N1, source_qnode_id=n00, target_qnode_id=n01)
        # overlay(action=compute_ngd, virtual_relation_label=N2, source_qnode_id=n00, target_qnode_id=n02)
        # overlay(action=compute_ngd, virtual_relation_label=N3, source_qnode_id=n01, target_qnode_id=n02)
        # resultify(ignore_edge_direction=true)
        # filter_results(action=limit_number_of_results, max_results=100)
        from ARAX_messenger import ARAXMessenger
        message = ARAXMessenger().from_dict(message_dict)

    if message is None:
        print("ERROR: Unable to fetch message")
        return

    # ranker.aggregate_scores(message, response=response)
    ranker.aggregate_scores_dmk(message, response=response)

    # Show the final result
    print(response.show(level=ARAXResponse.DEBUG))
    print("Results:")
    for result in message.results:
        confidence = result.confidence
        if confidence is None:
            confidence = 0.0
        print(" -" + '{:6.3f}'.format(confidence) + f"\t{result.essence}")
    # print(json.dumps(message.to_dict(), sort_keys=True, indent=2))

    # Show the message number
    print(json.dumps(ast.literal_eval(repr(message.id)), sort_keys=True, indent=2))


if __name__ == "__main__":
    main()