diff --git a/ehr_risk_kp.yaml b/ehr_risk_kp.yaml
index c379d72..ac3ed3d 100644
--- a/ehr_risk_kp.yaml
+++ b/ehr_risk_kp.yaml
@@ -813,7 +813,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -844,7 +844,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -877,7 +877,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -908,7 +908,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -941,7 +941,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -972,7 +972,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1005,7 +1005,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1036,7 +1036,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1079,7 +1079,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1110,7 +1110,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1143,7 +1143,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1174,7 +1174,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1207,7 +1207,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1238,7 +1238,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1277,7 +1277,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1308,7 +1308,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1345,7 +1345,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1376,7 +1376,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1413,7 +1413,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1444,7 +1444,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1477,7 +1477,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1508,7 +1508,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1541,7 +1541,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1572,7 +1572,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1611,7 +1611,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1642,7 +1642,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1679,7 +1679,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1710,7 +1710,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1743,7 +1743,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1774,7 +1774,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1815,7 +1815,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1846,7 +1846,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1883,7 +1883,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1914,7 +1914,7 @@ components:
parameters:
fields: >-
subject.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1951,7 +1951,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -1982,7 +1982,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2015,7 +2015,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2046,7 +2046,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2079,7 +2079,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2110,7 +2110,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2143,7 +2143,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2174,7 +2174,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2211,7 +2211,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2242,7 +2242,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2277,7 +2277,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2308,7 +2308,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2341,7 +2341,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2372,7 +2372,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2405,7 +2405,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2436,7 +2436,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2469,7 +2469,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2500,7 +2500,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2533,7 +2533,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2564,7 +2564,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2601,7 +2601,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2632,7 +2632,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2669,7 +2669,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2700,7 +2700,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2737,7 +2737,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2768,7 +2768,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2801,7 +2801,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2832,7 +2832,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2865,7 +2865,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2896,7 +2896,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2929,7 +2929,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2960,7 +2960,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -2995,7 +2995,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3026,7 +3026,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3059,7 +3059,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3090,7 +3090,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3125,7 +3125,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3156,7 +3156,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3189,7 +3189,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3220,7 +3220,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3253,7 +3253,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3284,7 +3284,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3317,7 +3317,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3348,7 +3348,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3381,7 +3381,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3412,7 +3412,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3445,7 +3445,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3476,7 +3476,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3511,7 +3511,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3542,7 +3542,7 @@ components:
parameters:
fields: >-
subject.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3579,7 +3579,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3610,7 +3610,7 @@ components:
parameters:
fields: >-
subject.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3647,7 +3647,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3678,7 +3678,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3711,7 +3711,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3742,7 +3742,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3775,7 +3775,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3806,7 +3806,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3839,7 +3839,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3870,7 +3870,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3903,7 +3903,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3934,7 +3934,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3967,7 +3967,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -3998,7 +3998,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4031,7 +4031,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4062,7 +4062,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4097,7 +4097,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4128,7 +4128,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4161,7 +4161,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4192,7 +4192,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4225,7 +4225,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4256,7 +4256,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4289,7 +4289,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4320,7 +4320,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4353,7 +4353,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4384,7 +4384,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4417,7 +4417,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4448,7 +4448,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4483,7 +4483,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4514,7 +4514,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4551,7 +4551,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4582,7 +4582,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4615,7 +4615,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4646,7 +4646,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4681,7 +4681,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4712,7 +4712,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4745,7 +4745,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4776,7 +4776,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4809,7 +4809,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4840,7 +4840,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4873,7 +4873,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4904,7 +4904,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4939,7 +4939,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -4970,7 +4970,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5003,7 +5003,7 @@ components:
parameters:
fields: >-
object.MONDO,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5034,7 +5034,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5069,7 +5069,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5100,7 +5100,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5133,7 +5133,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5164,7 +5164,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5197,7 +5197,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5228,7 +5228,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5261,7 +5261,7 @@ components:
parameters:
fields: >-
object.SNOMEDCT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5292,7 +5292,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5325,7 +5325,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5356,7 +5356,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5389,7 +5389,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5420,7 +5420,7 @@ components:
parameters:
fields: >-
subject.UNII,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5455,7 +5455,7 @@ components:
parameters:
fields: >-
object.HP,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5486,7 +5486,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5519,7 +5519,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5550,7 +5550,7 @@ components:
parameters:
fields: >-
subject.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5585,7 +5585,7 @@ components:
parameters:
fields: >-
object.NCIT,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5616,7 +5616,7 @@ components:
parameters:
fields: >-
subject.CHEBI,
- association.edge_attributes,
+ association.edge_attributes,source.edge_sources,
subject.name,object.name
size: 1000
outputs:
@@ -5641,58 +5641,70 @@ components:
input_name: subject.name
output_name: object.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
object-NCIT:
NCIT: object.NCIT
input_name: subject.name
output_name: object.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
object-SNOMEDCT:
SNOMEDCT: object.SNOMEDCT
input_name: subject.name
output_name: object.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
object-HP:
HP: object.HP
input_name: subject.name
output_name: object.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
object-CHEBI:
CHEBI: object.CHEBI
input_name: subject.name
output_name: object.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
object-UNII:
UNII: object.UNII
input_name: subject.name
output_name: object.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
subject-MONDO:
MONDO: subject.MONDO
input_name: object.name
output_name: subject.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
subject-NCIT:
NCIT: subject.NCIT
input_name: object.name
output_name: subject.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
subject-SNOMEDCT:
SNOMEDCT: subject.SNOMEDCT
input_name: object.name
output_name: subject.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
subject-HP:
HP: subject.HP
input_name: object.name
output_name: subject.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
subject-CHEBI:
CHEBI: subject.CHEBI
input_name: object.name
output_name: subject.name
edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
subject-UNII:
UNII: subject.UNII
input_name: object.name
output_name: subject.name
- edge-attributes: association.edge_attributes
\ No newline at end of file
+ edge-attributes: association.edge_attributes
+ trapi_sources: source.edge_sources
diff --git a/legacy_parsers/EHR_Risk_parser.ipynb b/legacy_parsers/EHR_Risk_parser.ipynb
new file mode 100644
index 0000000..6e1564d
--- /dev/null
+++ b/legacy_parsers/EHR_Risk_parser.ipynb
@@ -0,0 +1,12376 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d2996ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# qualifiers to include: https://github.com/biolink/biolink-model/issues/1050"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "0dc9bbef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import json\n",
+ "import sys, os\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "92478b80",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " category | \n",
+ " xref | \n",
+ " provided_by | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0025181 | \n",
+ " Abdominal aseptic abscess | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " HP:0002027 | \n",
+ " Abdominal pain | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " HP:0003115 | \n",
+ " Abnormal EKG | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " HP:0006919 | \n",
+ " Abnormal aggressive, impulsive or violent beha... | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " HP:0003119 | \n",
+ " Abnormal circulating lipid concentration | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 801 | \n",
+ " CHEBI:27300 | \n",
+ " vitamin d | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 802 | \n",
+ " CHEBI:33234 | \n",
+ " vitamin e | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 803 | \n",
+ " CHEBI:87732 | \n",
+ " warfarin | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 804 | \n",
+ " CHEBI:36560 | \n",
+ " zinc oxide | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 805 | \n",
+ " CHEBI:10125 | \n",
+ " zolpidem | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
806 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name \\\n",
+ "0 HP:0025181 Abdominal aseptic abscess \n",
+ "1 HP:0002027 Abdominal pain \n",
+ "2 HP:0003115 Abnormal EKG \n",
+ "3 HP:0006919 Abnormal aggressive, impulsive or violent beha... \n",
+ "4 HP:0003119 Abnormal circulating lipid concentration \n",
+ ".. ... ... \n",
+ "801 CHEBI:27300 vitamin d \n",
+ "802 CHEBI:33234 vitamin e \n",
+ "803 CHEBI:87732 warfarin \n",
+ "804 CHEBI:36560 zinc oxide \n",
+ "805 CHEBI:10125 zolpidem \n",
+ "\n",
+ " category xref provided_by \n",
+ "0 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "1 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "2 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "3 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "4 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ ".. ... ... ... \n",
+ "801 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "802 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "803 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "804 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "805 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "\n",
+ "[806 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# nodes_filepath = os.path.join(\"../../data\", \"ehr_risk_nodes_data_2023_03_09.csv\")\n",
+ "# edges_filepath = os.path.join(\"../../data\", \"ehr_risk_edges_data_2023_03_09.csv\")\n",
+ "nodes_filepath = os.path.join(\"../../data\", \"ehr_risk_nodes_data_2022_06_01.csv\")\n",
+ "edges_filepath = os.path.join(\"../../data\", \"ehr_risk_edges_data_2022_06_01.csv\")\n",
+ "nodes_data = pd.read_csv(nodes_filepath, sep = ',')\n",
+ "edges_data = pd.read_csv(edges_filepath, sep = ',')\n",
+ "\n",
+ "nodes_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "0cb5c158",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " category | \n",
+ " xref | \n",
+ " provided_by | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 511 | \n",
+ " MONDO:0009421 | \n",
+ " hypogonadism, male | \n",
+ " biolink:Disease | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name category xref \\\n",
+ "511 MONDO:0009421 hypogonadism, male biolink:Disease NaN \n",
+ "\n",
+ " provided_by \n",
+ "511 EHR Risk Provider (Multiomics) "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nodes_data.loc[nodes_data['name'].isin(['hypogonadism, male'])]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a92a7a5",
+ "metadata": {},
+ "source": [
+ "# There appear to be duplicate CURIEs \n",
+ "## This should be fixed at the enclave level"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "d9673bb6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " category | \n",
+ " xref | \n",
+ " provided_by | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0025181 | \n",
+ " Abdominal aseptic abscess | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " HP:0002027 | \n",
+ " Abdominal pain | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " HP:0003115 | \n",
+ " Abnormal EKG | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " HP:0006919 | \n",
+ " Abnormal aggressive, impulsive or violent beha... | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " HP:0003119 | \n",
+ " Abnormal circulating lipid concentration | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 801 | \n",
+ " CHEBI:27300 | \n",
+ " vitamin d | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 802 | \n",
+ " CHEBI:33234 | \n",
+ " vitamin e | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 803 | \n",
+ " CHEBI:87732 | \n",
+ " warfarin | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 804 | \n",
+ " CHEBI:36560 | \n",
+ " zinc oxide | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 805 | \n",
+ " CHEBI:10125 | \n",
+ " zolpidem | \n",
+ " biolink:ChemicalSubstance | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
804 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name \\\n",
+ "0 HP:0025181 Abdominal aseptic abscess \n",
+ "1 HP:0002027 Abdominal pain \n",
+ "2 HP:0003115 Abnormal EKG \n",
+ "3 HP:0006919 Abnormal aggressive, impulsive or violent beha... \n",
+ "4 HP:0003119 Abnormal circulating lipid concentration \n",
+ ".. ... ... \n",
+ "801 CHEBI:27300 vitamin d \n",
+ "802 CHEBI:33234 vitamin e \n",
+ "803 CHEBI:87732 warfarin \n",
+ "804 CHEBI:36560 zinc oxide \n",
+ "805 CHEBI:10125 zolpidem \n",
+ "\n",
+ " category xref provided_by \n",
+ "0 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "1 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "2 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "3 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "4 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ ".. ... ... ... \n",
+ "801 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "802 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "803 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "804 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "805 biolink:ChemicalSubstance NaN EHR Risk Provider (Multiomics) \n",
+ "\n",
+ "[804 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Some CURIEs (ids) appear more than once in the nodes table. Keep only the first row for each id\n",
+ "nodes_data = nodes_data.drop_duplicates(subset='id', keep=\"first\")\n",
+ "nodes_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "3eec3454",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " category | \n",
+ " xref | \n",
+ " provided_by | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0025181 | \n",
+ " Abdominal aseptic abscess | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " HP:0002027 | \n",
+ " Abdominal pain | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " HP:0003115 | \n",
+ " Abnormal EKG | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " HP:0006919 | \n",
+ " Abnormal aggressive, impulsive or violent beha... | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " HP:0003119 | \n",
+ " Abnormal circulating lipid concentration | \n",
+ " biolink:PhenotypicFeature | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 801 | \n",
+ " CHEBI:27300 | \n",
+ " vitamin d | \n",
+ " biolink:ChemicalEntity | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 802 | \n",
+ " CHEBI:33234 | \n",
+ " vitamin e | \n",
+ " biolink:ChemicalEntity | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 803 | \n",
+ " CHEBI:87732 | \n",
+ " warfarin | \n",
+ " biolink:ChemicalEntity | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 804 | \n",
+ " CHEBI:36560 | \n",
+ " zinc oxide | \n",
+ " biolink:ChemicalEntity | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ " 805 | \n",
+ " CHEBI:10125 | \n",
+ " zolpidem | \n",
+ " biolink:ChemicalEntity | \n",
+ " NaN | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
804 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name \\\n",
+ "0 HP:0025181 Abdominal aseptic abscess \n",
+ "1 HP:0002027 Abdominal pain \n",
+ "2 HP:0003115 Abnormal EKG \n",
+ "3 HP:0006919 Abnormal aggressive, impulsive or violent beha... \n",
+ "4 HP:0003119 Abnormal circulating lipid concentration \n",
+ ".. ... ... \n",
+ "801 CHEBI:27300 vitamin d \n",
+ "802 CHEBI:33234 vitamin e \n",
+ "803 CHEBI:87732 warfarin \n",
+ "804 CHEBI:36560 zinc oxide \n",
+ "805 CHEBI:10125 zolpidem \n",
+ "\n",
+ " category xref provided_by \n",
+ "0 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "1 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "2 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "3 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ "4 biolink:PhenotypicFeature NaN EHR Risk Provider (Multiomics) \n",
+ ".. ... ... ... \n",
+ "801 biolink:ChemicalEntity NaN EHR Risk Provider (Multiomics) \n",
+ "802 biolink:ChemicalEntity NaN EHR Risk Provider (Multiomics) \n",
+ "803 biolink:ChemicalEntity NaN EHR Risk Provider (Multiomics) \n",
+ "804 biolink:ChemicalEntity NaN EHR Risk Provider (Multiomics) \n",
+ "805 biolink:ChemicalEntity NaN EHR Risk Provider (Multiomics) \n",
+ "\n",
+ "[804 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# biolink:ChemicalSubstance has been deprecated. Use biolink:ChemicalEntity instead (change all values ChemicalSubstance--->ChemicalEntity)\n",
+ "nodes_data[\"category\"].mask(nodes_data[\"category\"] == \"biolink:ChemicalSubstance\", \"biolink:ChemicalEntity\" , inplace=True )\n",
+ "nodes_data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "af3fe28d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "804"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check if the xref column is all NaN or empty\n",
+ "# get count of null values \n",
+ "nodes_data['xref'].isna().sum()\n",
+ "\n",
+ "# since xref is empty, we won't use it"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "d3bdd0e5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " feature_importance | \n",
+ " feature_coefficient | \n",
+ " log_positive_patient_count | \n",
+ " log_negative_patient_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 8.796399 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " NaN | \n",
+ " 8.585212 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " NaN | \n",
+ " 4.558177 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " NaN | \n",
+ " 4.357522 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " NaN | \n",
+ " 3.926064 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " MONDO:0013600 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.245984 | \n",
+ " NaN | \n",
+ " -0.790418 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " MONDO:0016264 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.984249 | \n",
+ " NaN | \n",
+ " -0.796085 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " MONDO:0004565 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.820564 | \n",
+ " NaN | \n",
+ " -0.803973 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " CHEBI:28864 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.472603 | \n",
+ " NaN | \n",
+ " -0.822575 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0011947 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.000207 | \n",
+ " NaN | \n",
+ " -0.825731 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 MONDO:0013600 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 MONDO:0016264 biolink:associated_with_decreased_likelihood_of \n",
+ "237101 MONDO:0004565 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 CHEBI:28864 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0011947 biolink:associated_with_decreased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " category classifier auc_roc p_value \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 \n",
+ "... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.956860 0.245984 \n",
+ "237100 biolink:Association Logistic Regression 0.956860 0.984249 \n",
+ "237101 biolink:Association Logistic Regression 0.956860 0.820564 \n",
+ "237102 biolink:Association Logistic Regression 0.956860 0.472603 \n",
+ "237103 biolink:Association Logistic Regression 0.956860 0.000207 \n",
+ "\n",
+ " feature_importance feature_coefficient log_positive_patient_count \\\n",
+ "0 NaN 8.796399 5 \n",
+ "1 NaN 8.585212 5 \n",
+ "2 NaN 4.558177 5 \n",
+ "3 NaN 4.357522 5 \n",
+ "4 NaN 3.926064 5 \n",
+ "... ... ... ... \n",
+ "237099 NaN -0.790418 5 \n",
+ "237100 NaN -0.796085 5 \n",
+ "237101 NaN -0.803973 5 \n",
+ "237102 NaN -0.822575 5 \n",
+ "237103 NaN -0.825731 5 \n",
+ "\n",
+ " log_negative_patient_count \n",
+ "0 7 \n",
+ "1 7 \n",
+ "2 7 \n",
+ "3 7 \n",
+ "4 7 \n",
+ "... ... \n",
+ "237099 7 \n",
+ "237100 7 \n",
+ "237101 7 \n",
+ "237102 7 \n",
+ "237103 7 \n",
+ "\n",
+ "[237104 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "edges_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "29d182f7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['biolink:associated_with_increased_likelihood_of',\n",
+ " 'biolink:associated_with_decreased_likelihood_of'], dtype=object)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# sanity check on predicates\n",
+ "edges_data['predicate'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "2f5092be",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " feature_importance | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " NaN | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " NaN | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " NaN | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " NaN | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " MONDO:0013600 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.245984 | \n",
+ " NaN | \n",
+ " -0.790418 | \n",
+ " 99922 | \n",
+ " 9995233 | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " MONDO:0016264 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.984249 | \n",
+ " NaN | \n",
+ " -0.796085 | \n",
+ " 99822 | \n",
+ " 10002245 | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " MONDO:0004565 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.820564 | \n",
+ " NaN | \n",
+ " -0.803973 | \n",
+ " 99937 | \n",
+ " 9999068 | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " CHEBI:28864 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.472603 | \n",
+ " NaN | \n",
+ " -0.822575 | \n",
+ " 99588 | \n",
+ " 10000517 | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0011947 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.000207 | \n",
+ " NaN | \n",
+ " -0.825731 | \n",
+ " 100073 | \n",
+ " 9995547 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 MONDO:0013600 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 MONDO:0016264 biolink:associated_with_decreased_likelihood_of \n",
+ "237101 MONDO:0004565 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 CHEBI:28864 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0011947 biolink:associated_with_decreased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " category classifier auc_roc p_value \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 \n",
+ "... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.956860 0.245984 \n",
+ "237100 biolink:Association Logistic Regression 0.956860 0.984249 \n",
+ "237101 biolink:Association Logistic Regression 0.956860 0.820564 \n",
+ "237102 biolink:Association Logistic Regression 0.956860 0.472603 \n",
+ "237103 biolink:Association Logistic Regression 0.956860 0.000207 \n",
+ "\n",
+ " feature_importance feature_coefficient num_patients_with_condition \\\n",
+ "0 NaN 8.796399 99669 \n",
+ "1 NaN 8.585212 99938 \n",
+ "2 NaN 4.558177 99770 \n",
+ "3 NaN 4.357522 99910 \n",
+ "4 NaN 3.926064 100242 \n",
+ "... ... ... ... \n",
+ "237099 NaN -0.790418 99922 \n",
+ "237100 NaN -0.796085 99822 \n",
+ "237101 NaN -0.803973 99937 \n",
+ "237102 NaN -0.822575 99588 \n",
+ "237103 NaN -0.825731 100073 \n",
+ "\n",
+ " num_patients_without_condition \n",
+ "0 9999902 \n",
+ "1 10000835 \n",
+ "2 10000939 \n",
+ "3 9998659 \n",
+ "4 9998750 \n",
+ "... ... \n",
+ "237099 9995233 \n",
+ "237100 10002245 \n",
+ "237101 9999068 \n",
+ "237102 10000517 \n",
+ "237103 9995547 \n",
+ "\n",
+ "[237104 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "edges_data[\"num_patients_with_condition\"] = 10**(edges_data['log_positive_patient_count']) # convert log pos patient count to an actual # \n",
+ "edges_data[\"num_patients_without_condition\"] = 10**(edges_data['log_negative_patient_count']) # convert log neg patient count to an actual # \n",
+ "edges_data[\"num_patients_with_condition\"] = np.random.poisson(edges_data['num_patients_with_condition']) # add poisson noise injection\n",
+ "edges_data[\"num_patients_without_condition\"] = np.random.poisson(edges_data['num_patients_without_condition']) # add poisson noise injection\n",
+ "edges_data = edges_data.drop(['log_positive_patient_count', 'log_negative_patient_count'], axis=1)\n",
+ "edges_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "ef4fb895",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " feature_importance | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " NaN | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " NaN | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " NaN | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " NaN | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " MONDO:0013600 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.245984 | \n",
+ " NaN | \n",
+ " -0.790418 | \n",
+ " 99922 | \n",
+ " 9995233 | \n",
+ " (MONDO:0013600, HP:0033106) | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " MONDO:0016264 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.984249 | \n",
+ " NaN | \n",
+ " -0.796085 | \n",
+ " 99822 | \n",
+ " 10002245 | \n",
+ " (MONDO:0016264, HP:0033106) | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " MONDO:0004565 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.820564 | \n",
+ " NaN | \n",
+ " -0.803973 | \n",
+ " 99937 | \n",
+ " 9999068 | \n",
+ " (HP:0033106, MONDO:0004565) | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " CHEBI:28864 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.472603 | \n",
+ " NaN | \n",
+ " -0.822575 | \n",
+ " 99588 | \n",
+ " 10000517 | \n",
+ " (CHEBI:28864, HP:0033106) | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0011947 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.000207 | \n",
+ " NaN | \n",
+ " -0.825731 | \n",
+ " 100073 | \n",
+ " 9995547 | \n",
+ " (HP:0011947, HP:0033106) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 15 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 MONDO:0013600 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 MONDO:0016264 biolink:associated_with_decreased_likelihood_of \n",
+ "237101 MONDO:0004565 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 CHEBI:28864 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0011947 biolink:associated_with_decreased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " category classifier auc_roc p_value \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 \n",
+ "... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.956860 0.245984 \n",
+ "237100 biolink:Association Logistic Regression 0.956860 0.984249 \n",
+ "237101 biolink:Association Logistic Regression 0.956860 0.820564 \n",
+ "237102 biolink:Association Logistic Regression 0.956860 0.472603 \n",
+ "237103 biolink:Association Logistic Regression 0.956860 0.000207 \n",
+ "\n",
+ " feature_importance feature_coefficient num_patients_with_condition \\\n",
+ "0 NaN 8.796399 99669 \n",
+ "1 NaN 8.585212 99938 \n",
+ "2 NaN 4.558177 99770 \n",
+ "3 NaN 4.357522 99910 \n",
+ "4 NaN 3.926064 100242 \n",
+ "... ... ... ... \n",
+ "237099 NaN -0.790418 99922 \n",
+ "237100 NaN -0.796085 99822 \n",
+ "237101 NaN -0.803973 99937 \n",
+ "237102 NaN -0.822575 99588 \n",
+ "237103 NaN -0.825731 100073 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 9995233 (MONDO:0013600, HP:0033106) \n",
+ "237100 10002245 (MONDO:0016264, HP:0033106) \n",
+ "237101 9999068 (HP:0033106, MONDO:0004565) \n",
+ "237102 10000517 (CHEBI:28864, HP:0033106) \n",
+ "237103 9995547 (HP:0011947, HP:0033106) \n",
+ "\n",
+ "[237104 rows x 15 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build an order-independent key for each edge from its subject and object (a frozenset ignores element order)\n",
+ "# Generic pattern: df['AB'] = df[['A', 'B']].apply(frozenset, axis=1)\n",
+ "edges_data['nodes_frozenset'] = edges_data[['subject', 'object']].apply(frozenset, axis=1)\n",
+ "edges_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "610615b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " feature_importance | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " NaN | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " NaN | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " NaN | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " NaN | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " MONDO:0013600 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.245984 | \n",
+ " NaN | \n",
+ " -0.790418 | \n",
+ " 99922 | \n",
+ " 9995233 | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " MONDO:0016264 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.984249 | \n",
+ " NaN | \n",
+ " -0.796085 | \n",
+ " 99822 | \n",
+ " 10002245 | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " MONDO:0004565 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.820564 | \n",
+ " NaN | \n",
+ " -0.803973 | \n",
+ " 99937 | \n",
+ " 9999068 | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " CHEBI:28864 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.472603 | \n",
+ " NaN | \n",
+ " -0.822575 | \n",
+ " 99588 | \n",
+ " 10000517 | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0011947 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033106 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.956860 | \n",
+ " 0.000207 | \n",
+ " NaN | \n",
+ " -0.825731 | \n",
+ " 100073 | \n",
+ " 9995547 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 MONDO:0013600 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 MONDO:0016264 biolink:associated_with_decreased_likelihood_of \n",
+ "237101 MONDO:0004565 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 CHEBI:28864 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0011947 biolink:associated_with_decreased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0033106 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " category classifier auc_roc p_value \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 \n",
+ "... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.956860 0.245984 \n",
+ "237100 biolink:Association Logistic Regression 0.956860 0.984249 \n",
+ "237101 biolink:Association Logistic Regression 0.956860 0.820564 \n",
+ "237102 biolink:Association Logistic Regression 0.956860 0.472603 \n",
+ "237103 biolink:Association Logistic Regression 0.956860 0.000207 \n",
+ "\n",
+ " feature_importance feature_coefficient num_patients_with_condition \\\n",
+ "0 NaN 8.796399 99669 \n",
+ "1 NaN 8.585212 99938 \n",
+ "2 NaN 4.558177 99770 \n",
+ "3 NaN 4.357522 99910 \n",
+ "4 NaN 3.926064 100242 \n",
+ "... ... ... ... \n",
+ "237099 NaN -0.790418 99922 \n",
+ "237100 NaN -0.796085 99822 \n",
+ "237101 NaN -0.803973 99937 \n",
+ "237102 NaN -0.822575 99588 \n",
+ "237103 NaN -0.825731 100073 \n",
+ "\n",
+ " num_patients_without_condition \n",
+ "0 9999902 \n",
+ "1 10000835 \n",
+ "2 10000939 \n",
+ "3 9998659 \n",
+ "4 9998750 \n",
+ "... ... \n",
+ "237099 9995233 \n",
+ "237100 10002245 \n",
+ "237101 9999068 \n",
+ "237102 10000517 \n",
+ "237103 9995547 \n",
+ "\n",
+ "[237104 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Flag duplicate edges: rows that repeat an earlier row's subject/object pair (in either order), auc_roc, p_value and feature_coefficient\n",
+ "duplicate_rows = edges_data.duplicated(['nodes_frozenset', 'auc_roc', 'p_value', 'feature_coefficient'])\n",
+ "\n",
+ "# Filter the DataFrame to remove duplicate rows\n",
+ "test = edges_data[~duplicate_rows]\n",
+ "\n",
+ "test = test.drop(columns='nodes_frozenset')\n",
+ "\n",
+ "test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "dc4252fb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # create confidence interval column by concatenating 'lower_confidence_bound' and 'upper_confidence_bound', then dropping those columns\n",
+ "# edges_data['log_odds_ratio_95_confidence_interval'] = edges_data.apply(lambda row: [row['lower_confidence_bound'], row['upper_confidence_bound']], axis=1)\n",
+ "# edges_data = edges_data.drop(['lower_confidence_bound', 'upper_confidence_bound'], axis=1)\n",
+ "# edges_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "8531ffa3",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " category_x | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " feature_importance | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " id | \n",
+ " name | \n",
+ " category_y | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0000739 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.876100 | \n",
+ " 0.985604 | \n",
+ " NaN | \n",
+ " -0.530062 | \n",
+ " 99841 | \n",
+ " 9999688 | \n",
+ " (HP:0008629, HP:0000739) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0000787 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.841102 | \n",
+ " 0.981654 | \n",
+ " NaN | \n",
+ " -1.180669 | \n",
+ " 100354 | \n",
+ " 10002196 | \n",
+ " (HP:0008629, HP:0000787) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0000790 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.910838 | \n",
+ " 0.960982 | \n",
+ " NaN | \n",
+ " -1.200019 | \n",
+ " 99835 | \n",
+ " 10001455 | \n",
+ " (HP:0008629, HP:0000790) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000870 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.884580 | \n",
+ " 0.988524 | \n",
+ " NaN | \n",
+ " 1.432611 | \n",
+ " 968 | \n",
+ " 9996545 | \n",
+ " (HP:0008629, HP:0000870) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0032312 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.921417 | \n",
+ " 0.927675 | \n",
+ " NaN | \n",
+ " -0.171607 | \n",
+ " 99721 | \n",
+ " 10001233 | \n",
+ " (HP:0000360, HP:0032312) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0032372 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.990023 | \n",
+ " 0.962012 | \n",
+ " NaN | \n",
+ " -2.817568 | \n",
+ " 1041 | \n",
+ " 9998402 | \n",
+ " (HP:0000360, HP:0032372) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0032473 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.963206 | \n",
+ " 0.989129 | \n",
+ " NaN | \n",
+ " -5.438955 | \n",
+ " 8 | \n",
+ " 10000998 | \n",
+ " (HP:0000360, HP:0032473) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033077 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.987230 | \n",
+ " 0.541701 | \n",
+ " NaN | \n",
+ " -1.017309 | \n",
+ " 100548 | \n",
+ " 10003263 | \n",
+ " (HP:0000360, HP:0033077) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033078 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.934115 | \n",
+ " 0.807916 | \n",
+ " NaN | \n",
+ " -0.929524 | \n",
+ " 100345 | \n",
+ " 10001251 | \n",
+ " (HP:0000360, HP:0033078) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 18 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 HP:0008629 biolink:associated_with_decreased_likelihood_of \n",
+ "2 HP:0008629 biolink:associated_with_decreased_likelihood_of \n",
+ "3 HP:0008629 biolink:associated_with_decreased_likelihood_of \n",
+ "4 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237101 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000739 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000787 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000790 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000870 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0032312 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0032372 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0032473 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0033077 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0033078 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " category_x classifier auc_roc p_value \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 \n",
+ "1 biolink:Association Logistic Regression 0.876100 0.985604 \n",
+ "2 biolink:Association Logistic Regression 0.841102 0.981654 \n",
+ "3 biolink:Association Logistic Regression 0.910838 0.960982 \n",
+ "4 biolink:Association Logistic Regression 0.884580 0.988524 \n",
+ "... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.921417 0.927675 \n",
+ "237100 biolink:Association Logistic Regression 0.990023 0.962012 \n",
+ "237101 biolink:Association Logistic Regression 0.963206 0.989129 \n",
+ "237102 biolink:Association Logistic Regression 0.987230 0.541701 \n",
+ "237103 biolink:Association Logistic Regression 0.934115 0.807916 \n",
+ "\n",
+ " feature_importance feature_coefficient num_patients_with_condition \\\n",
+ "0 NaN 8.796399 99669 \n",
+ "1 NaN -0.530062 99841 \n",
+ "2 NaN -1.180669 100354 \n",
+ "3 NaN -1.200019 99835 \n",
+ "4 NaN 1.432611 968 \n",
+ "... ... ... ... \n",
+ "237099 NaN -0.171607 99721 \n",
+ "237100 NaN -2.817568 1041 \n",
+ "237101 NaN -5.438955 8 \n",
+ "237102 NaN -1.017309 100548 \n",
+ "237103 NaN -0.929524 100345 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset id \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) HP:0008629 \n",
+ "1 9999688 (HP:0008629, HP:0000739) HP:0008629 \n",
+ "2 10002196 (HP:0008629, HP:0000787) HP:0008629 \n",
+ "3 10001455 (HP:0008629, HP:0000790) HP:0008629 \n",
+ "4 9996545 (HP:0008629, HP:0000870) HP:0008629 \n",
+ "... ... ... ... \n",
+ "237099 10001233 (HP:0000360, HP:0032312) HP:0000360 \n",
+ "237100 9998402 (HP:0000360, HP:0032372) HP:0000360 \n",
+ "237101 10000998 (HP:0000360, HP:0032473) HP:0000360 \n",
+ "237102 10003263 (HP:0000360, HP:0033077) HP:0000360 \n",
+ "237103 10001251 (HP:0000360, HP:0033078) HP:0000360 \n",
+ "\n",
+ " name category_y \n",
+ "0 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "1 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "2 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "3 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "4 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "... ... ... \n",
+ "237099 Tinnitus biolink:PhenotypicFeature \n",
+ "237100 Tinnitus biolink:PhenotypicFeature \n",
+ "237101 Tinnitus biolink:PhenotypicFeature \n",
+ "237102 Tinnitus biolink:PhenotypicFeature \n",
+ "237103 Tinnitus biolink:PhenotypicFeature \n",
+ "\n",
+ "[237104 rows x 18 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kg = pd.merge(edges_data, nodes_data[['id', 'name', 'category']], left_on='subject', right_on = 'id', how=\"inner\")\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "60ab6b78",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " feature_importance | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0000739 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.876100 | \n",
+ " 0.985604 | \n",
+ " NaN | \n",
+ " -0.530062 | \n",
+ " 99841 | \n",
+ " 9999688 | \n",
+ " (HP:0008629, HP:0000739) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0000787 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.841102 | \n",
+ " 0.981654 | \n",
+ " NaN | \n",
+ " -1.180669 | \n",
+ " 100354 | \n",
+ " 10002196 | \n",
+ " (HP:0008629, HP:0000787) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0000790 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.910838 | \n",
+ " 0.960982 | \n",
+ " NaN | \n",
+ " -1.200019 | \n",
+ " 99835 | \n",
+ " 10001455 | \n",
+ " (HP:0008629, HP:0000790) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000870 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.884580 | \n",
+ " 0.988524 | \n",
+ " NaN | \n",
+ " 1.432611 | \n",
+ " 968 | \n",
+ " 9996545 | \n",
+ " (HP:0008629, HP:0000870) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0032312 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.921417 | \n",
+ " 0.927675 | \n",
+ " NaN | \n",
+ " -0.171607 | \n",
+ " 99721 | \n",
+ " 10001233 | \n",
+ " (HP:0000360, HP:0032312) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0032372 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.990023 | \n",
+ " 0.962012 | \n",
+ " NaN | \n",
+ " -2.817568 | \n",
+ " 1041 | \n",
+ " 9998402 | \n",
+ " (HP:0000360, HP:0032372) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0032473 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.963206 | \n",
+ " 0.989129 | \n",
+ " NaN | \n",
+ " -5.438955 | \n",
+ " 8 | \n",
+ " 10000998 | \n",
+ " (HP:0000360, HP:0032473) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033077 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.987230 | \n",
+ " 0.541701 | \n",
+ " NaN | \n",
+ " -1.017309 | \n",
+ " 100548 | \n",
+ " 10003263 | \n",
+ " (HP:0000360, HP:0033077) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0033078 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.934115 | \n",
+ " 0.807916 | \n",
+ " NaN | \n",
+ " -0.929524 | \n",
+ " 100345 | \n",
+ " 10001251 | \n",
+ " (HP:0000360, HP:0033078) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 18 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 HP:0008629 biolink:associated_with_decreased_likelihood_of \n",
+ "2 HP:0008629 biolink:associated_with_decreased_likelihood_of \n",
+ "3 HP:0008629 biolink:associated_with_decreased_likelihood_of \n",
+ "4 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237101 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_decreased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000739 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000787 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000790 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000870 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0032312 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0032372 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0032473 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0033077 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0033078 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 \n",
+ "1 biolink:Association Logistic Regression 0.876100 0.985604 \n",
+ "2 biolink:Association Logistic Regression 0.841102 0.981654 \n",
+ "3 biolink:Association Logistic Regression 0.910838 0.960982 \n",
+ "4 biolink:Association Logistic Regression 0.884580 0.988524 \n",
+ "... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.921417 0.927675 \n",
+ "237100 biolink:Association Logistic Regression 0.990023 0.962012 \n",
+ "237101 biolink:Association Logistic Regression 0.963206 0.989129 \n",
+ "237102 biolink:Association Logistic Regression 0.987230 0.541701 \n",
+ "237103 biolink:Association Logistic Regression 0.934115 0.807916 \n",
+ "\n",
+ " feature_importance feature_coefficient num_patients_with_condition \\\n",
+ "0 NaN 8.796399 99669 \n",
+ "1 NaN -0.530062 99841 \n",
+ "2 NaN -1.180669 100354 \n",
+ "3 NaN -1.200019 99835 \n",
+ "4 NaN 1.432611 968 \n",
+ "... ... ... ... \n",
+ "237099 NaN -0.171607 99721 \n",
+ "237100 NaN -2.817568 1041 \n",
+ "237101 NaN -5.438955 8 \n",
+ "237102 NaN -1.017309 100548 \n",
+ "237103 NaN -0.929524 100345 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset subject_id \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) HP:0008629 \n",
+ "1 9999688 (HP:0008629, HP:0000739) HP:0008629 \n",
+ "2 10002196 (HP:0008629, HP:0000787) HP:0008629 \n",
+ "3 10001455 (HP:0008629, HP:0000790) HP:0008629 \n",
+ "4 9996545 (HP:0008629, HP:0000870) HP:0008629 \n",
+ "... ... ... ... \n",
+ "237099 10001233 (HP:0000360, HP:0032312) HP:0000360 \n",
+ "237100 9998402 (HP:0000360, HP:0032372) HP:0000360 \n",
+ "237101 10000998 (HP:0000360, HP:0032473) HP:0000360 \n",
+ "237102 10003263 (HP:0000360, HP:0033077) HP:0000360 \n",
+ "237103 10001251 (HP:0000360, HP:0033078) HP:0000360 \n",
+ "\n",
+ " subject_name subject_category \n",
+ "0 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "1 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "2 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "3 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "4 Pulsatile tinnitus biolink:PhenotypicFeature \n",
+ "... ... ... \n",
+ "237099 Tinnitus biolink:PhenotypicFeature \n",
+ "237100 Tinnitus biolink:PhenotypicFeature \n",
+ "237101 Tinnitus biolink:PhenotypicFeature \n",
+ "237102 Tinnitus biolink:PhenotypicFeature \n",
+ "237103 Tinnitus biolink:PhenotypicFeature \n",
+ "\n",
+ "[237104 rows x 18 columns]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kg.rename(columns = {'category_x':'predicate_category',\n",
+ " 'category_y': 'subject_category',\n",
+ " 'id': 'subject_id',\n",
+ " 'name': 'subject_name'}, inplace = True)\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "89d414ca",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " ... | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ " id | \n",
+ " name | \n",
+ " category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " ... | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ " MONDO:0010643 | \n",
+ " acute leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " ... | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ " UNII:25ADE2236L | \n",
+ " thrombin | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " ... | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " pertuzumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " ... | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ " MONDO:0007972 | \n",
+ " Meniere disease | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " CHEBI:114785 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998503 | \n",
+ " ... | \n",
+ " -5.087542 | \n",
+ " 1007 | \n",
+ " 10007299 | \n",
+ " (HP:0008629, CHEBI:114785) | \n",
+ " CHEBI:114785 | \n",
+ " erlotinib | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.999719 | \n",
+ " ... | \n",
+ " -5.066445 | \n",
+ " 1016 | \n",
+ " 9996273 | \n",
+ " (UNII:52CMI0WC3Y, HP:0008629) | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " atezolizumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " CHEBI:135738 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998357 | \n",
+ " ... | \n",
+ " -5.079640 | \n",
+ " 978 | \n",
+ " 9998030 | \n",
+ " (HP:0008629, CHEBI:135738) | \n",
+ " CHEBI:135738 | \n",
+ " clevidipine | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " MONDO:0004967 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.997631 | \n",
+ " ... | \n",
+ " -4.683547 | \n",
+ " 1006 | \n",
+ " 10001385 | \n",
+ " (HP:0008629, MONDO:0004967) | \n",
+ " MONDO:0004967 | \n",
+ " acute lymphoblastic leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 7.553003 | \n",
+ " 997 | \n",
+ " 9997731 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 UNII:52CMI0WC3Y biolink:associated_with_decreased_likelihood_of \n",
+ "237101 CHEBI:135738 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 MONDO:0004967 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_increased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value ... \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 ... \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 ... \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 ... \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 ... \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 ... \n",
+ "... ... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.959791 0.998503 ... \n",
+ "237100 biolink:Association Logistic Regression 0.959791 0.999719 ... \n",
+ "237101 biolink:Association Logistic Regression 0.959791 0.998357 ... \n",
+ "237102 biolink:Association Logistic Regression 0.959791 0.997631 ... \n",
+ "237103 biolink:Association Logistic Regression 0.959791 0.000000 ... \n",
+ "\n",
+ " feature_coefficient num_patients_with_condition \\\n",
+ "0 8.796399 99669 \n",
+ "1 8.585212 99938 \n",
+ "2 4.558177 99770 \n",
+ "3 4.357522 99910 \n",
+ "4 3.926064 100242 \n",
+ "... ... ... \n",
+ "237099 -5.087542 1007 \n",
+ "237100 -5.066445 1016 \n",
+ "237101 -5.079640 978 \n",
+ "237102 -4.683547 1006 \n",
+ "237103 7.553003 997 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 10007299 (HP:0008629, CHEBI:114785) \n",
+ "237100 9996273 (UNII:52CMI0WC3Y, HP:0008629) \n",
+ "237101 9998030 (HP:0008629, CHEBI:135738) \n",
+ "237102 10001385 (HP:0008629, MONDO:0004967) \n",
+ "237103 9997731 (HP:0000360, HP:0008629) \n",
+ "\n",
+ " subject_id subject_name \\\n",
+ "0 HP:0008629 Pulsatile tinnitus \n",
+ "1 MONDO:0010643 acute leukemia (disease) \n",
+ "2 UNII:25ADE2236L thrombin \n",
+ "3 UNII:K16AIQ8CTM pertuzumab \n",
+ "4 MONDO:0007972 Meniere disease \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 erlotinib \n",
+ "237100 UNII:52CMI0WC3Y atezolizumab \n",
+ "237101 CHEBI:135738 clevidipine \n",
+ "237102 MONDO:0004967 acute lymphoblastic leukemia (disease) \n",
+ "237103 HP:0000360 Tinnitus \n",
+ "\n",
+ " subject_category id name \\\n",
+ "0 biolink:PhenotypicFeature HP:0000360 Tinnitus \n",
+ "1 biolink:Disease HP:0000360 Tinnitus \n",
+ "2 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "3 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "4 biolink:Disease HP:0000360 Tinnitus \n",
+ "... ... ... ... \n",
+ "237099 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237100 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237101 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237102 biolink:Disease HP:0008629 Pulsatile tinnitus \n",
+ "237103 biolink:PhenotypicFeature HP:0008629 Pulsatile tinnitus \n",
+ "\n",
+ " category \n",
+ "0 biolink:PhenotypicFeature \n",
+ "1 biolink:PhenotypicFeature \n",
+ "2 biolink:PhenotypicFeature \n",
+ "3 biolink:PhenotypicFeature \n",
+ "4 biolink:PhenotypicFeature \n",
+ "... ... \n",
+ "237099 biolink:PhenotypicFeature \n",
+ "237100 biolink:PhenotypicFeature \n",
+ "237101 biolink:PhenotypicFeature \n",
+ "237102 biolink:PhenotypicFeature \n",
+ "237103 biolink:PhenotypicFeature \n",
+ "\n",
+ "[237104 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# merging object info from nodes df\n",
+ "kg = pd.merge(kg, nodes_data[['id', 'name', 'category']], left_on='object', right_on = 'id', how=\"inner\")\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "b81d60e9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " ... | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ " object_id | \n",
+ " object_name | \n",
+ " object_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " ... | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ " MONDO:0010643 | \n",
+ " acute leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " ... | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ " UNII:25ADE2236L | \n",
+ " thrombin | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " ... | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " pertuzumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " ... | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ " MONDO:0007972 | \n",
+ " Meniere disease | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " CHEBI:114785 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998503 | \n",
+ " ... | \n",
+ " -5.087542 | \n",
+ " 1007 | \n",
+ " 10007299 | \n",
+ " (HP:0008629, CHEBI:114785) | \n",
+ " CHEBI:114785 | \n",
+ " erlotinib | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.999719 | \n",
+ " ... | \n",
+ " -5.066445 | \n",
+ " 1016 | \n",
+ " 9996273 | \n",
+ " (UNII:52CMI0WC3Y, HP:0008629) | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " atezolizumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " CHEBI:135738 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998357 | \n",
+ " ... | \n",
+ " -5.079640 | \n",
+ " 978 | \n",
+ " 9998030 | \n",
+ " (HP:0008629, CHEBI:135738) | \n",
+ " CHEBI:135738 | \n",
+ " clevidipine | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " MONDO:0004967 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.997631 | \n",
+ " ... | \n",
+ " -4.683547 | \n",
+ " 1006 | \n",
+ " 10001385 | \n",
+ " (HP:0008629, MONDO:0004967) | \n",
+ " MONDO:0004967 | \n",
+ " acute lymphoblastic leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 7.553003 | \n",
+ " 997 | \n",
+ " 9997731 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 UNII:52CMI0WC3Y biolink:associated_with_decreased_likelihood_of \n",
+ "237101 CHEBI:135738 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 MONDO:0004967 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_increased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value ... \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 ... \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 ... \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 ... \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 ... \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 ... \n",
+ "... ... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.959791 0.998503 ... \n",
+ "237100 biolink:Association Logistic Regression 0.959791 0.999719 ... \n",
+ "237101 biolink:Association Logistic Regression 0.959791 0.998357 ... \n",
+ "237102 biolink:Association Logistic Regression 0.959791 0.997631 ... \n",
+ "237103 biolink:Association Logistic Regression 0.959791 0.000000 ... \n",
+ "\n",
+ " feature_coefficient num_patients_with_condition \\\n",
+ "0 8.796399 99669 \n",
+ "1 8.585212 99938 \n",
+ "2 4.558177 99770 \n",
+ "3 4.357522 99910 \n",
+ "4 3.926064 100242 \n",
+ "... ... ... \n",
+ "237099 -5.087542 1007 \n",
+ "237100 -5.066445 1016 \n",
+ "237101 -5.079640 978 \n",
+ "237102 -4.683547 1006 \n",
+ "237103 7.553003 997 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 10007299 (HP:0008629, CHEBI:114785) \n",
+ "237100 9996273 (UNII:52CMI0WC3Y, HP:0008629) \n",
+ "237101 9998030 (HP:0008629, CHEBI:135738) \n",
+ "237102 10001385 (HP:0008629, MONDO:0004967) \n",
+ "237103 9997731 (HP:0000360, HP:0008629) \n",
+ "\n",
+ " subject_id subject_name \\\n",
+ "0 HP:0008629 Pulsatile tinnitus \n",
+ "1 MONDO:0010643 acute leukemia (disease) \n",
+ "2 UNII:25ADE2236L thrombin \n",
+ "3 UNII:K16AIQ8CTM pertuzumab \n",
+ "4 MONDO:0007972 Meniere disease \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 erlotinib \n",
+ "237100 UNII:52CMI0WC3Y atezolizumab \n",
+ "237101 CHEBI:135738 clevidipine \n",
+ "237102 MONDO:0004967 acute lymphoblastic leukemia (disease) \n",
+ "237103 HP:0000360 Tinnitus \n",
+ "\n",
+ " subject_category object_id object_name \\\n",
+ "0 biolink:PhenotypicFeature HP:0000360 Tinnitus \n",
+ "1 biolink:Disease HP:0000360 Tinnitus \n",
+ "2 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "3 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "4 biolink:Disease HP:0000360 Tinnitus \n",
+ "... ... ... ... \n",
+ "237099 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237100 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237101 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237102 biolink:Disease HP:0008629 Pulsatile tinnitus \n",
+ "237103 biolink:PhenotypicFeature HP:0008629 Pulsatile tinnitus \n",
+ "\n",
+ " object_category \n",
+ "0 biolink:PhenotypicFeature \n",
+ "1 biolink:PhenotypicFeature \n",
+ "2 biolink:PhenotypicFeature \n",
+ "3 biolink:PhenotypicFeature \n",
+ "4 biolink:PhenotypicFeature \n",
+ "... ... \n",
+ "237099 biolink:PhenotypicFeature \n",
+ "237100 biolink:PhenotypicFeature \n",
+ "237101 biolink:PhenotypicFeature \n",
+ "237102 biolink:PhenotypicFeature \n",
+ "237103 biolink:PhenotypicFeature \n",
+ "\n",
+ "[237104 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kg.rename(columns = {'id':'object_id',\n",
+ " 'category': 'object_category',\n",
+ " 'name': 'object_name'}, inplace = True)\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "d54c7830",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " ... | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ " object_id | \n",
+ " object_name | \n",
+ " object_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " ... | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ " MONDO:0010643 | \n",
+ " acute leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " ... | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ " UNII:25ADE2236L | \n",
+ " thrombin | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " ... | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " pertuzumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " ... | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ " MONDO:0007972 | \n",
+ " Meniere disease | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " CHEBI:114785 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998503 | \n",
+ " ... | \n",
+ " -5.087542 | \n",
+ " 1007 | \n",
+ " 10007299 | \n",
+ " (HP:0008629, CHEBI:114785) | \n",
+ " CHEBI:114785 | \n",
+ " erlotinib | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.999719 | \n",
+ " ... | \n",
+ " -5.066445 | \n",
+ " 1016 | \n",
+ " 9996273 | \n",
+ " (UNII:52CMI0WC3Y, HP:0008629) | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " atezolizumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " CHEBI:135738 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998357 | \n",
+ " ... | \n",
+ " -5.079640 | \n",
+ " 978 | \n",
+ " 9998030 | \n",
+ " (HP:0008629, CHEBI:135738) | \n",
+ " CHEBI:135738 | \n",
+ " clevidipine | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " MONDO:0004967 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.997631 | \n",
+ " ... | \n",
+ " -4.683547 | \n",
+ " 1006 | \n",
+ " 10001385 | \n",
+ " (HP:0008629, MONDO:0004967) | \n",
+ " MONDO:0004967 | \n",
+ " acute lymphoblastic leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 7.553003 | \n",
+ " 997 | \n",
+ " 9997731 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
237104 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 UNII:52CMI0WC3Y biolink:associated_with_decreased_likelihood_of \n",
+ "237101 CHEBI:135738 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 MONDO:0004967 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_increased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value ... \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 ... \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 ... \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 ... \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 ... \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 ... \n",
+ "... ... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.959791 0.998503 ... \n",
+ "237100 biolink:Association Logistic Regression 0.959791 0.999719 ... \n",
+ "237101 biolink:Association Logistic Regression 0.959791 0.998357 ... \n",
+ "237102 biolink:Association Logistic Regression 0.959791 0.997631 ... \n",
+ "237103 biolink:Association Logistic Regression 0.959791 0.000000 ... \n",
+ "\n",
+ " feature_coefficient num_patients_with_condition \\\n",
+ "0 8.796399 99669 \n",
+ "1 8.585212 99938 \n",
+ "2 4.558177 99770 \n",
+ "3 4.357522 99910 \n",
+ "4 3.926064 100242 \n",
+ "... ... ... \n",
+ "237099 -5.087542 1007 \n",
+ "237100 -5.066445 1016 \n",
+ "237101 -5.079640 978 \n",
+ "237102 -4.683547 1006 \n",
+ "237103 7.553003 997 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 10007299 (HP:0008629, CHEBI:114785) \n",
+ "237100 9996273 (UNII:52CMI0WC3Y, HP:0008629) \n",
+ "237101 9998030 (HP:0008629, CHEBI:135738) \n",
+ "237102 10001385 (HP:0008629, MONDO:0004967) \n",
+ "237103 9997731 (HP:0000360, HP:0008629) \n",
+ "\n",
+ " subject_id subject_name \\\n",
+ "0 HP:0008629 Pulsatile tinnitus \n",
+ "1 MONDO:0010643 acute leukemia (disease) \n",
+ "2 UNII:25ADE2236L thrombin \n",
+ "3 UNII:K16AIQ8CTM pertuzumab \n",
+ "4 MONDO:0007972 Meniere disease \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 erlotinib \n",
+ "237100 UNII:52CMI0WC3Y atezolizumab \n",
+ "237101 CHEBI:135738 clevidipine \n",
+ "237102 MONDO:0004967 acute lymphoblastic leukemia (disease) \n",
+ "237103 HP:0000360 Tinnitus \n",
+ "\n",
+ " subject_category object_id object_name \\\n",
+ "0 biolink:PhenotypicFeature HP:0000360 Tinnitus \n",
+ "1 biolink:Disease HP:0000360 Tinnitus \n",
+ "2 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "3 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "4 biolink:Disease HP:0000360 Tinnitus \n",
+ "... ... ... ... \n",
+ "237099 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237100 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237101 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237102 biolink:Disease HP:0008629 Pulsatile tinnitus \n",
+ "237103 biolink:PhenotypicFeature HP:0008629 Pulsatile tinnitus \n",
+ "\n",
+ " object_category \n",
+ "0 biolink:PhenotypicFeature \n",
+ "1 biolink:PhenotypicFeature \n",
+ "2 biolink:PhenotypicFeature \n",
+ "3 biolink:PhenotypicFeature \n",
+ "4 biolink:PhenotypicFeature \n",
+ "... ... \n",
+ "237099 biolink:PhenotypicFeature \n",
+ "237100 biolink:PhenotypicFeature \n",
+ "237101 biolink:PhenotypicFeature \n",
+ "237102 biolink:PhenotypicFeature \n",
+ "237103 biolink:PhenotypicFeature \n",
+ "\n",
+ "[237104 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
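+ "# Drop duplicate records: rows sharing the same subject, object, auc_roc, p_value and feature_coefficient count as one record (the first is kept)\n",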
+ "kg = kg.drop_duplicates(['subject', 'object', 'auc_roc', 'p_value', 'feature_coefficient'], keep='first')\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "08820f13",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "292\n",
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "kg_NONE_subjects = kg[kg[\"subject\"].str.contains(\"NONE\")==True]\n",
+ "print(len(kg_NONE_subjects))\n",
+ "kg_NONE_objects = kg[kg[\"object\"].str.contains(\"NONE\")==True]\n",
+ "print(len(kg_NONE_objects))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "cc1c65fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " ... | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ " object_id | \n",
+ " object_name | \n",
+ " object_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " ... | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ " MONDO:0010643 | \n",
+ " acute leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " ... | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ " UNII:25ADE2236L | \n",
+ " thrombin | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " ... | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " pertuzumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " ... | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ " MONDO:0007972 | \n",
+ " Meniere disease | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " CHEBI:114785 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998503 | \n",
+ " ... | \n",
+ " -5.087542 | \n",
+ " 1007 | \n",
+ " 10007299 | \n",
+ " (HP:0008629, CHEBI:114785) | \n",
+ " CHEBI:114785 | \n",
+ " erlotinib | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.999719 | \n",
+ " ... | \n",
+ " -5.066445 | \n",
+ " 1016 | \n",
+ " 9996273 | \n",
+ " (UNII:52CMI0WC3Y, HP:0008629) | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " atezolizumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " CHEBI:135738 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998357 | \n",
+ " ... | \n",
+ " -5.079640 | \n",
+ " 978 | \n",
+ " 9998030 | \n",
+ " (HP:0008629, CHEBI:135738) | \n",
+ " CHEBI:135738 | \n",
+ " clevidipine | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " MONDO:0004967 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.997631 | \n",
+ " ... | \n",
+ " -4.683547 | \n",
+ " 1006 | \n",
+ " 10001385 | \n",
+ " (HP:0008629, MONDO:0004967) | \n",
+ " MONDO:0004967 | \n",
+ " acute lymphoblastic leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 7.553003 | \n",
+ " 997 | \n",
+ " 9997731 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
236812 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 UNII:52CMI0WC3Y biolink:associated_with_decreased_likelihood_of \n",
+ "237101 CHEBI:135738 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 MONDO:0004967 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_increased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value ... \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 ... \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 ... \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 ... \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 ... \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 ... \n",
+ "... ... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.959791 0.998503 ... \n",
+ "237100 biolink:Association Logistic Regression 0.959791 0.999719 ... \n",
+ "237101 biolink:Association Logistic Regression 0.959791 0.998357 ... \n",
+ "237102 biolink:Association Logistic Regression 0.959791 0.997631 ... \n",
+ "237103 biolink:Association Logistic Regression 0.959791 0.000000 ... \n",
+ "\n",
+ " feature_coefficient num_patients_with_condition \\\n",
+ "0 8.796399 99669 \n",
+ "1 8.585212 99938 \n",
+ "2 4.558177 99770 \n",
+ "3 4.357522 99910 \n",
+ "4 3.926064 100242 \n",
+ "... ... ... \n",
+ "237099 -5.087542 1007 \n",
+ "237100 -5.066445 1016 \n",
+ "237101 -5.079640 978 \n",
+ "237102 -4.683547 1006 \n",
+ "237103 7.553003 997 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 10007299 (HP:0008629, CHEBI:114785) \n",
+ "237100 9996273 (UNII:52CMI0WC3Y, HP:0008629) \n",
+ "237101 9998030 (HP:0008629, CHEBI:135738) \n",
+ "237102 10001385 (HP:0008629, MONDO:0004967) \n",
+ "237103 9997731 (HP:0000360, HP:0008629) \n",
+ "\n",
+ " subject_id subject_name \\\n",
+ "0 HP:0008629 Pulsatile tinnitus \n",
+ "1 MONDO:0010643 acute leukemia (disease) \n",
+ "2 UNII:25ADE2236L thrombin \n",
+ "3 UNII:K16AIQ8CTM pertuzumab \n",
+ "4 MONDO:0007972 Meniere disease \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 erlotinib \n",
+ "237100 UNII:52CMI0WC3Y atezolizumab \n",
+ "237101 CHEBI:135738 clevidipine \n",
+ "237102 MONDO:0004967 acute lymphoblastic leukemia (disease) \n",
+ "237103 HP:0000360 Tinnitus \n",
+ "\n",
+ " subject_category object_id object_name \\\n",
+ "0 biolink:PhenotypicFeature HP:0000360 Tinnitus \n",
+ "1 biolink:Disease HP:0000360 Tinnitus \n",
+ "2 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "3 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "4 biolink:Disease HP:0000360 Tinnitus \n",
+ "... ... ... ... \n",
+ "237099 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237100 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237101 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237102 biolink:Disease HP:0008629 Pulsatile tinnitus \n",
+ "237103 biolink:PhenotypicFeature HP:0008629 Pulsatile tinnitus \n",
+ "\n",
+ " object_category \n",
+ "0 biolink:PhenotypicFeature \n",
+ "1 biolink:PhenotypicFeature \n",
+ "2 biolink:PhenotypicFeature \n",
+ "3 biolink:PhenotypicFeature \n",
+ "4 biolink:PhenotypicFeature \n",
+ "... ... \n",
+ "237099 biolink:PhenotypicFeature \n",
+ "237100 biolink:PhenotypicFeature \n",
+ "237101 biolink:PhenotypicFeature \n",
+ "237102 biolink:PhenotypicFeature \n",
+ "237103 biolink:PhenotypicFeature \n",
+ "\n",
+ "[236812 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Some subject/object IDs are unusable: they are either missing (NaN) or literally contain the string \"NONE\" instead of a real ID, so drop both kinds of rows\n",
+ "kg = kg.dropna(axis=0, subset=['subject'])\n",
+ "kg = kg.dropna(axis=0, subset=['object'])\n",
+ "kg = kg[~kg[\"subject\"].str.contains(\"NONE\")==True]\n",
+ "kg = kg[~kg[\"object\"].str.contains(\"NONE\")==True]\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "c14460e0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " ... | \n",
+ " feature_coefficient | \n",
+ " num_patients_with_condition | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ " object_id | \n",
+ " object_name | \n",
+ " object_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 8.796399 | \n",
+ " 99669 | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " ... | \n",
+ " 8.585212 | \n",
+ " 99938 | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ " MONDO:0010643 | \n",
+ " acute leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " ... | \n",
+ " 4.558177 | \n",
+ " 99770 | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ " UNII:25ADE2236L | \n",
+ " thrombin | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " ... | \n",
+ " 4.357522 | \n",
+ " 99910 | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " pertuzumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " ... | \n",
+ " 3.926064 | \n",
+ " 100242 | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ " MONDO:0007972 | \n",
+ " Meniere disease | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " CHEBI:114785 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998503 | \n",
+ " ... | \n",
+ " -5.087542 | \n",
+ " 1007 | \n",
+ " 10007299 | \n",
+ " (HP:0008629, CHEBI:114785) | \n",
+ " CHEBI:114785 | \n",
+ " erlotinib | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.999719 | \n",
+ " ... | \n",
+ " -5.066445 | \n",
+ " 1016 | \n",
+ " 9996273 | \n",
+ " (UNII:52CMI0WC3Y, HP:0008629) | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " atezolizumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " CHEBI:135738 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998357 | \n",
+ " ... | \n",
+ " -5.079640 | \n",
+ " 978 | \n",
+ " 9998030 | \n",
+ " (HP:0008629, CHEBI:135738) | \n",
+ " CHEBI:135738 | \n",
+ " clevidipine | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " MONDO:0004967 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.997631 | \n",
+ " ... | \n",
+ " -4.683547 | \n",
+ " 1006 | \n",
+ " 10001385 | \n",
+ " (HP:0008629, MONDO:0004967) | \n",
+ " MONDO:0004967 | \n",
+ " acute lymphoblastic leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 7.553003 | \n",
+ " 997 | \n",
+ " 9997731 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
236812 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 UNII:52CMI0WC3Y biolink:associated_with_decreased_likelihood_of \n",
+ "237101 CHEBI:135738 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 MONDO:0004967 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_increased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value ... \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 ... \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 ... \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 ... \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 ... \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 ... \n",
+ "... ... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.959791 0.998503 ... \n",
+ "237100 biolink:Association Logistic Regression 0.959791 0.999719 ... \n",
+ "237101 biolink:Association Logistic Regression 0.959791 0.998357 ... \n",
+ "237102 biolink:Association Logistic Regression 0.959791 0.997631 ... \n",
+ "237103 biolink:Association Logistic Regression 0.959791 0.000000 ... \n",
+ "\n",
+ " feature_coefficient num_patients_with_condition \\\n",
+ "0 8.796399 99669 \n",
+ "1 8.585212 99938 \n",
+ "2 4.558177 99770 \n",
+ "3 4.357522 99910 \n",
+ "4 3.926064 100242 \n",
+ "... ... ... \n",
+ "237099 -5.087542 1007 \n",
+ "237100 -5.066445 1016 \n",
+ "237101 -5.079640 978 \n",
+ "237102 -4.683547 1006 \n",
+ "237103 7.553003 997 \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 10007299 (HP:0008629, CHEBI:114785) \n",
+ "237100 9996273 (UNII:52CMI0WC3Y, HP:0008629) \n",
+ "237101 9998030 (HP:0008629, CHEBI:135738) \n",
+ "237102 10001385 (HP:0008629, MONDO:0004967) \n",
+ "237103 9997731 (HP:0000360, HP:0008629) \n",
+ "\n",
+ " subject_id subject_name \\\n",
+ "0 HP:0008629 Pulsatile tinnitus \n",
+ "1 MONDO:0010643 acute leukemia (disease) \n",
+ "2 UNII:25ADE2236L thrombin \n",
+ "3 UNII:K16AIQ8CTM pertuzumab \n",
+ "4 MONDO:0007972 Meniere disease \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 erlotinib \n",
+ "237100 UNII:52CMI0WC3Y atezolizumab \n",
+ "237101 CHEBI:135738 clevidipine \n",
+ "237102 MONDO:0004967 acute lymphoblastic leukemia (disease) \n",
+ "237103 HP:0000360 Tinnitus \n",
+ "\n",
+ " subject_category object_id object_name \\\n",
+ "0 biolink:PhenotypicFeature HP:0000360 Tinnitus \n",
+ "1 biolink:Disease HP:0000360 Tinnitus \n",
+ "2 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "3 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "4 biolink:Disease HP:0000360 Tinnitus \n",
+ "... ... ... ... \n",
+ "237099 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237100 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237101 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237102 biolink:Disease HP:0008629 Pulsatile tinnitus \n",
+ "237103 biolink:PhenotypicFeature HP:0008629 Pulsatile tinnitus \n",
+ "\n",
+ " object_category \n",
+ "0 biolink:PhenotypicFeature \n",
+ "1 biolink:PhenotypicFeature \n",
+ "2 biolink:PhenotypicFeature \n",
+ "3 biolink:PhenotypicFeature \n",
+ "4 biolink:PhenotypicFeature \n",
+ "... ... \n",
+ "237099 biolink:PhenotypicFeature \n",
+ "237100 biolink:PhenotypicFeature \n",
+ "237101 biolink:PhenotypicFeature \n",
+ "237102 biolink:PhenotypicFeature \n",
+ "237103 biolink:PhenotypicFeature \n",
+ "\n",
+ "[236812 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# for some reason, some subject or object ids are empty (they actually literally contain the string \"NONE\")\n",
+ "# get rid of these rows\n",
+ "kg = kg[~kg[\"subject\"].str.contains(\"NONE\")==True] # subject and object are all CURIEs, not names\n",
+ "kg = kg[~kg[\"object\"].str.contains(\"NONE\")==True]\n",
+ "kg = kg[~kg[\"subject\"].str.contains(\"none\")==True]\n",
+ "kg = kg[~kg[\"object\"].str.contains(\"none\")==True]\n",
+ "kg = kg[~kg[\"subject\"].str.contains(\"None\")==True]\n",
+ "kg = kg[~kg[\"object\"].str.contains(\"None\")==True]\n",
+ "kg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5201401a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# OPTIONAL THRESHOLD\n",
+ "# kg_pval_subsetted = kg[kg[\"p_value\"] < 0.2]\n",
+ "# kg_pval_subsetted"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 334,
+ "id": "e809e4a0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject_category_postfix | \n",
+ " subject_prefix | \n",
+ " increased_or_decreased | \n",
+ " object_category_postfix | \n",
+ " object_prefix | \n",
+ " Count | \n",
+ " object_category_postfix_abbv | \n",
+ " subject_category_postfix_abbv | \n",
+ " joined_to_match_xbte | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 34917 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 212 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 2917 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 24505 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 2254 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 244 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 222 | \n",
+ " Procedure | \n",
+ " Chem | \n",
+ " ChemCHEBI_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 18248 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 131 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 1542 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 13225 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 1176 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 99 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " ChemicalEntity | \n",
+ " CHEBI | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 121 | \n",
+ " Procedure | \n",
+ " Chem | \n",
+ " ChemCHEBI_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 5902 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 44 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 505 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 3727 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 381 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 39 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 43 | \n",
+ " Procedure | \n",
+ " Chem | \n",
+ " ChemUNII_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 2158 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemUNII_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 8 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemUNII_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 171 | \n",
+ " Disease | \n",
+ " Chem | \n",
+ " ChemUNII_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 1993 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemUNII_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 139 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemUNII_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 13 | \n",
+ " Pheno | \n",
+ " Chem | \n",
+ " ChemUNII_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " ChemicalEntity | \n",
+ " UNII | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 9 | \n",
+ " Procedure | \n",
+ " Chem | \n",
+ " ChemUNII_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 13640 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 86 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 1237 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 10101 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 1293 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 88 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 113 | \n",
+ " Procedure | \n",
+ " Disease | \n",
+ " DiseaseMONDO_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 14415 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 96 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 1129 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 9919 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 527 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 94 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 69 | \n",
+ " Procedure | \n",
+ " Disease | \n",
+ " DiseaseMONDO_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 73 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseNCIT_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 7 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseNCIT_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 44 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseNCIT_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 9 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseNCIT_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 1 | \n",
+ " Procedure | \n",
+ " Disease | \n",
+ " DiseaseNCIT_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 82 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseNCIT_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 6 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseNCIT_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 66 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseNCIT_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 50 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 1 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseNCIT_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 1 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseNCIT_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 52 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 1318 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 53 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 10 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 54 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 101 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 957 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 116 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 57 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 8 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 58 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 13 | \n",
+ " Procedure | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 1007 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 5 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 61 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 81 | \n",
+ " Disease | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 693 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 34 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 64 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 7 | \n",
+ " Pheno | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 65 | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 2 | \n",
+ " Procedure | \n",
+ " Disease | \n",
+ " DiseaseSNOMEDCT_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 66 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 18759 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 67 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 113 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 68 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 1532 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 11425 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 70 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 1257 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 71 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 121 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 72 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 132 | \n",
+ " Procedure | \n",
+ " Pheno | \n",
+ " PhenoHP_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 12086 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 74 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 86 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 75 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 1055 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 76 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 10355 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 77 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 733 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 78 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 78 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 67 | \n",
+ " Procedure | \n",
+ " Pheno | \n",
+ " PhenoHP_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 1170 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 7 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 94 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 705 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 84 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 56 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 85 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 9 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 7 | \n",
+ " Procedure | \n",
+ " Pheno | \n",
+ " PhenoNCIT_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 87 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 380 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 3 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 36 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 90 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 395 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 91 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 34 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 92 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 1 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 93 | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 3 | \n",
+ " Procedure | \n",
+ " Pheno | \n",
+ " PhenoNCIT_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 94 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 307 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 95 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 1 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 96 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 26 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 97 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 217 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 98 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 25 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 99 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 2 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 100 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 2 | \n",
+ " Procedure | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 101 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 158 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 102 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 2 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 103 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 13 | \n",
+ " Disease | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 104 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 113 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 105 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 5 | \n",
+ " Pheno | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 106 | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 1 | \n",
+ " Procedure | \n",
+ " Pheno | \n",
+ " PhenoSNOMEDCT_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 107 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 721 | \n",
+ " Disease | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 108 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 4 | \n",
+ " Disease | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 58 | \n",
+ " Disease | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 502 | \n",
+ " Pheno | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 111 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 56 | \n",
+ " Pheno | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 112 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 4 | \n",
+ " Pheno | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 113 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " decreased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 3 | \n",
+ " Procedure | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_decreased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ " 114 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " MONDO | \n",
+ " 209 | \n",
+ " Disease | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_DiseaseMONDO | \n",
+ "
\n",
+ " \n",
+ " 115 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " NCIT | \n",
+ " 2 | \n",
+ " Disease | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_DiseaseNCIT | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Disease | \n",
+ " SNOMEDCT | \n",
+ " 20 | \n",
+ " Disease | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_DiseaseSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " HP | \n",
+ " 158 | \n",
+ " Pheno | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_PhenoHP | \n",
+ "
\n",
+ " \n",
+ " 118 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " NCIT | \n",
+ " 4 | \n",
+ " Pheno | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_PhenoNCIT | \n",
+ "
\n",
+ " \n",
+ " 119 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " PhenotypicFeature | \n",
+ " SNOMEDCT | \n",
+ " 2 | \n",
+ " Pheno | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_PhenoSNOMEDCT | \n",
+ "
\n",
+ " \n",
+ " 120 | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " increased | \n",
+ " Procedure | \n",
+ " NCIT | \n",
+ " 2 | \n",
+ " Procedure | \n",
+ " Procedure | \n",
+ " ProcedureNCIT_increased_ProcedureNCIT | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject_category_postfix subject_prefix increased_or_decreased \\\n",
+ "0 ChemicalEntity CHEBI decreased \n",
+ "1 ChemicalEntity CHEBI decreased \n",
+ "2 ChemicalEntity CHEBI decreased \n",
+ "3 ChemicalEntity CHEBI decreased \n",
+ "4 ChemicalEntity CHEBI decreased \n",
+ "5 ChemicalEntity CHEBI decreased \n",
+ "6 ChemicalEntity CHEBI decreased \n",
+ "7 ChemicalEntity CHEBI increased \n",
+ "8 ChemicalEntity CHEBI increased \n",
+ "9 ChemicalEntity CHEBI increased \n",
+ "10 ChemicalEntity CHEBI increased \n",
+ "11 ChemicalEntity CHEBI increased \n",
+ "12 ChemicalEntity CHEBI increased \n",
+ "13 ChemicalEntity CHEBI increased \n",
+ "14 ChemicalEntity UNII decreased \n",
+ "15 ChemicalEntity UNII decreased \n",
+ "16 ChemicalEntity UNII decreased \n",
+ "17 ChemicalEntity UNII decreased \n",
+ "18 ChemicalEntity UNII decreased \n",
+ "19 ChemicalEntity UNII decreased \n",
+ "20 ChemicalEntity UNII decreased \n",
+ "21 ChemicalEntity UNII increased \n",
+ "22 ChemicalEntity UNII increased \n",
+ "23 ChemicalEntity UNII increased \n",
+ "24 ChemicalEntity UNII increased \n",
+ "25 ChemicalEntity UNII increased \n",
+ "26 ChemicalEntity UNII increased \n",
+ "27 ChemicalEntity UNII increased \n",
+ "28 Disease MONDO decreased \n",
+ "29 Disease MONDO decreased \n",
+ "30 Disease MONDO decreased \n",
+ "31 Disease MONDO decreased \n",
+ "32 Disease MONDO decreased \n",
+ "33 Disease MONDO decreased \n",
+ "34 Disease MONDO decreased \n",
+ "35 Disease MONDO increased \n",
+ "36 Disease MONDO increased \n",
+ "37 Disease MONDO increased \n",
+ "38 Disease MONDO increased \n",
+ "39 Disease MONDO increased \n",
+ "40 Disease MONDO increased \n",
+ "41 Disease MONDO increased \n",
+ "42 Disease NCIT decreased \n",
+ "43 Disease NCIT decreased \n",
+ "44 Disease NCIT decreased \n",
+ "45 Disease NCIT decreased \n",
+ "46 Disease NCIT decreased \n",
+ "47 Disease NCIT increased \n",
+ "48 Disease NCIT increased \n",
+ "49 Disease NCIT increased \n",
+ "50 Disease NCIT increased \n",
+ "51 Disease NCIT increased \n",
+ "52 Disease SNOMEDCT decreased \n",
+ "53 Disease SNOMEDCT decreased \n",
+ "54 Disease SNOMEDCT decreased \n",
+ "55 Disease SNOMEDCT decreased \n",
+ "56 Disease SNOMEDCT decreased \n",
+ "57 Disease SNOMEDCT decreased \n",
+ "58 Disease SNOMEDCT decreased \n",
+ "59 Disease SNOMEDCT increased \n",
+ "60 Disease SNOMEDCT increased \n",
+ "61 Disease SNOMEDCT increased \n",
+ "62 Disease SNOMEDCT increased \n",
+ "63 Disease SNOMEDCT increased \n",
+ "64 Disease SNOMEDCT increased \n",
+ "65 Disease SNOMEDCT increased \n",
+ "66 PhenotypicFeature HP decreased \n",
+ "67 PhenotypicFeature HP decreased \n",
+ "68 PhenotypicFeature HP decreased \n",
+ "69 PhenotypicFeature HP decreased \n",
+ "70 PhenotypicFeature HP decreased \n",
+ "71 PhenotypicFeature HP decreased \n",
+ "72 PhenotypicFeature HP decreased \n",
+ "73 PhenotypicFeature HP increased \n",
+ "74 PhenotypicFeature HP increased \n",
+ "75 PhenotypicFeature HP increased \n",
+ "76 PhenotypicFeature HP increased \n",
+ "77 PhenotypicFeature HP increased \n",
+ "78 PhenotypicFeature HP increased \n",
+ "79 PhenotypicFeature HP increased \n",
+ "80 PhenotypicFeature NCIT decreased \n",
+ "81 PhenotypicFeature NCIT decreased \n",
+ "82 PhenotypicFeature NCIT decreased \n",
+ "83 PhenotypicFeature NCIT decreased \n",
+ "84 PhenotypicFeature NCIT decreased \n",
+ "85 PhenotypicFeature NCIT decreased \n",
+ "86 PhenotypicFeature NCIT decreased \n",
+ "87 PhenotypicFeature NCIT increased \n",
+ "88 PhenotypicFeature NCIT increased \n",
+ "89 PhenotypicFeature NCIT increased \n",
+ "90 PhenotypicFeature NCIT increased \n",
+ "91 PhenotypicFeature NCIT increased \n",
+ "92 PhenotypicFeature NCIT increased \n",
+ "93 PhenotypicFeature NCIT increased \n",
+ "94 PhenotypicFeature SNOMEDCT decreased \n",
+ "95 PhenotypicFeature SNOMEDCT decreased \n",
+ "96 PhenotypicFeature SNOMEDCT decreased \n",
+ "97 PhenotypicFeature SNOMEDCT decreased \n",
+ "98 PhenotypicFeature SNOMEDCT decreased \n",
+ "99 PhenotypicFeature SNOMEDCT decreased \n",
+ "100 PhenotypicFeature SNOMEDCT decreased \n",
+ "101 PhenotypicFeature SNOMEDCT increased \n",
+ "102 PhenotypicFeature SNOMEDCT increased \n",
+ "103 PhenotypicFeature SNOMEDCT increased \n",
+ "104 PhenotypicFeature SNOMEDCT increased \n",
+ "105 PhenotypicFeature SNOMEDCT increased \n",
+ "106 PhenotypicFeature SNOMEDCT increased \n",
+ "107 Procedure NCIT decreased \n",
+ "108 Procedure NCIT decreased \n",
+ "109 Procedure NCIT decreased \n",
+ "110 Procedure NCIT decreased \n",
+ "111 Procedure NCIT decreased \n",
+ "112 Procedure NCIT decreased \n",
+ "113 Procedure NCIT decreased \n",
+ "114 Procedure NCIT increased \n",
+ "115 Procedure NCIT increased \n",
+ "116 Procedure NCIT increased \n",
+ "117 Procedure NCIT increased \n",
+ "118 Procedure NCIT increased \n",
+ "119 Procedure NCIT increased \n",
+ "120 Procedure NCIT increased \n",
+ "\n",
+ " object_category_postfix object_prefix Count object_category_postfix_abbv \\\n",
+ "0 Disease MONDO 34917 Disease \n",
+ "1 Disease NCIT 212 Disease \n",
+ "2 Disease SNOMEDCT 2917 Disease \n",
+ "3 PhenotypicFeature HP 24505 Pheno \n",
+ "4 PhenotypicFeature NCIT 2254 Pheno \n",
+ "5 PhenotypicFeature SNOMEDCT 244 Pheno \n",
+ "6 Procedure NCIT 222 Procedure \n",
+ "7 Disease MONDO 18248 Disease \n",
+ "8 Disease NCIT 131 Disease \n",
+ "9 Disease SNOMEDCT 1542 Disease \n",
+ "10 PhenotypicFeature HP 13225 Pheno \n",
+ "11 PhenotypicFeature NCIT 1176 Pheno \n",
+ "12 PhenotypicFeature SNOMEDCT 99 Pheno \n",
+ "13 Procedure NCIT 121 Procedure \n",
+ "14 Disease MONDO 5902 Disease \n",
+ "15 Disease NCIT 44 Disease \n",
+ "16 Disease SNOMEDCT 505 Disease \n",
+ "17 PhenotypicFeature HP 3727 Pheno \n",
+ "18 PhenotypicFeature NCIT 381 Pheno \n",
+ "19 PhenotypicFeature SNOMEDCT 39 Pheno \n",
+ "20 Procedure NCIT 43 Procedure \n",
+ "21 Disease MONDO 2158 Disease \n",
+ "22 Disease NCIT 8 Disease \n",
+ "23 Disease SNOMEDCT 171 Disease \n",
+ "24 PhenotypicFeature HP 1993 Pheno \n",
+ "25 PhenotypicFeature NCIT 139 Pheno \n",
+ "26 PhenotypicFeature SNOMEDCT 13 Pheno \n",
+ "27 Procedure NCIT 9 Procedure \n",
+ "28 Disease MONDO 13640 Disease \n",
+ "29 Disease NCIT 86 Disease \n",
+ "30 Disease SNOMEDCT 1237 Disease \n",
+ "31 PhenotypicFeature HP 10101 Pheno \n",
+ "32 PhenotypicFeature NCIT 1293 Pheno \n",
+ "33 PhenotypicFeature SNOMEDCT 88 Pheno \n",
+ "34 Procedure NCIT 113 Procedure \n",
+ "35 Disease MONDO 14415 Disease \n",
+ "36 Disease NCIT 96 Disease \n",
+ "37 Disease SNOMEDCT 1129 Disease \n",
+ "38 PhenotypicFeature HP 9919 Pheno \n",
+ "39 PhenotypicFeature NCIT 527 Pheno \n",
+ "40 PhenotypicFeature SNOMEDCT 94 Pheno \n",
+ "41 Procedure NCIT 69 Procedure \n",
+ "42 Disease MONDO 73 Disease \n",
+ "43 Disease SNOMEDCT 7 Disease \n",
+ "44 PhenotypicFeature HP 44 Pheno \n",
+ "45 PhenotypicFeature NCIT 9 Pheno \n",
+ "46 Procedure NCIT 1 Procedure \n",
+ "47 Disease MONDO 82 Disease \n",
+ "48 Disease SNOMEDCT 6 Disease \n",
+ "49 PhenotypicFeature HP 66 Pheno \n",
+ "50 PhenotypicFeature NCIT 1 Pheno \n",
+ "51 PhenotypicFeature SNOMEDCT 1 Pheno \n",
+ "52 Disease MONDO 1318 Disease \n",
+ "53 Disease NCIT 10 Disease \n",
+ "54 Disease SNOMEDCT 101 Disease \n",
+ "55 PhenotypicFeature HP 957 Pheno \n",
+ "56 PhenotypicFeature NCIT 116 Pheno \n",
+ "57 PhenotypicFeature SNOMEDCT 8 Pheno \n",
+ "58 Procedure NCIT 13 Procedure \n",
+ "59 Disease MONDO 1007 Disease \n",
+ "60 Disease NCIT 5 Disease \n",
+ "61 Disease SNOMEDCT 81 Disease \n",
+ "62 PhenotypicFeature HP 693 Pheno \n",
+ "63 PhenotypicFeature NCIT 34 Pheno \n",
+ "64 PhenotypicFeature SNOMEDCT 7 Pheno \n",
+ "65 Procedure NCIT 2 Procedure \n",
+ "66 Disease MONDO 18759 Disease \n",
+ "67 Disease NCIT 113 Disease \n",
+ "68 Disease SNOMEDCT 1532 Disease \n",
+ "69 PhenotypicFeature HP 11425 Pheno \n",
+ "70 PhenotypicFeature NCIT 1257 Pheno \n",
+ "71 PhenotypicFeature SNOMEDCT 121 Pheno \n",
+ "72 Procedure NCIT 132 Procedure \n",
+ "73 Disease MONDO 12086 Disease \n",
+ "74 Disease NCIT 86 Disease \n",
+ "75 Disease SNOMEDCT 1055 Disease \n",
+ "76 PhenotypicFeature HP 10355 Pheno \n",
+ "77 PhenotypicFeature NCIT 733 Pheno \n",
+ "78 PhenotypicFeature SNOMEDCT 78 Pheno \n",
+ "79 Procedure NCIT 67 Procedure \n",
+ "80 Disease MONDO 1170 Disease \n",
+ "81 Disease NCIT 7 Disease \n",
+ "82 Disease SNOMEDCT 94 Disease \n",
+ "83 PhenotypicFeature HP 705 Pheno \n",
+ "84 PhenotypicFeature NCIT 56 Pheno \n",
+ "85 PhenotypicFeature SNOMEDCT 9 Pheno \n",
+ "86 Procedure NCIT 7 Procedure \n",
+ "87 Disease MONDO 380 Disease \n",
+ "88 Disease NCIT 3 Disease \n",
+ "89 Disease SNOMEDCT 36 Disease \n",
+ "90 PhenotypicFeature HP 395 Pheno \n",
+ "91 PhenotypicFeature NCIT 34 Pheno \n",
+ "92 PhenotypicFeature SNOMEDCT 1 Pheno \n",
+ "93 Procedure NCIT 3 Procedure \n",
+ "94 Disease MONDO 307 Disease \n",
+ "95 Disease NCIT 1 Disease \n",
+ "96 Disease SNOMEDCT 26 Disease \n",
+ "97 PhenotypicFeature HP 217 Pheno \n",
+ "98 PhenotypicFeature NCIT 25 Pheno \n",
+ "99 PhenotypicFeature SNOMEDCT 2 Pheno \n",
+ "100 Procedure NCIT 2 Procedure \n",
+ "101 Disease MONDO 158 Disease \n",
+ "102 Disease NCIT 2 Disease \n",
+ "103 Disease SNOMEDCT 13 Disease \n",
+ "104 PhenotypicFeature HP 113 Pheno \n",
+ "105 PhenotypicFeature NCIT 5 Pheno \n",
+ "106 Procedure NCIT 1 Procedure \n",
+ "107 Disease MONDO 721 Disease \n",
+ "108 Disease NCIT 4 Disease \n",
+ "109 Disease SNOMEDCT 58 Disease \n",
+ "110 PhenotypicFeature HP 502 Pheno \n",
+ "111 PhenotypicFeature NCIT 56 Pheno \n",
+ "112 PhenotypicFeature SNOMEDCT 4 Pheno \n",
+ "113 Procedure NCIT 3 Procedure \n",
+ "114 Disease MONDO 209 Disease \n",
+ "115 Disease NCIT 2 Disease \n",
+ "116 Disease SNOMEDCT 20 Disease \n",
+ "117 PhenotypicFeature HP 158 Pheno \n",
+ "118 PhenotypicFeature NCIT 4 Pheno \n",
+ "119 PhenotypicFeature SNOMEDCT 2 Pheno \n",
+ "120 Procedure NCIT 2 Procedure \n",
+ "\n",
+ " subject_category_postfix_abbv joined_to_match_xbte \n",
+ "0 Chem ChemCHEBI_decreased_DiseaseMONDO \n",
+ "1 Chem ChemCHEBI_decreased_DiseaseNCIT \n",
+ "2 Chem ChemCHEBI_decreased_DiseaseSNOMEDCT \n",
+ "3 Chem ChemCHEBI_decreased_PhenoHP \n",
+ "4 Chem ChemCHEBI_decreased_PhenoNCIT \n",
+ "5 Chem ChemCHEBI_decreased_PhenoSNOMEDCT \n",
+ "6 Chem ChemCHEBI_decreased_ProcedureNCIT \n",
+ "7 Chem ChemCHEBI_increased_DiseaseMONDO \n",
+ "8 Chem ChemCHEBI_increased_DiseaseNCIT \n",
+ "9 Chem ChemCHEBI_increased_DiseaseSNOMEDCT \n",
+ "10 Chem ChemCHEBI_increased_PhenoHP \n",
+ "11 Chem ChemCHEBI_increased_PhenoNCIT \n",
+ "12 Chem ChemCHEBI_increased_PhenoSNOMEDCT \n",
+ "13 Chem ChemCHEBI_increased_ProcedureNCIT \n",
+ "14 Chem ChemUNII_decreased_DiseaseMONDO \n",
+ "15 Chem ChemUNII_decreased_DiseaseNCIT \n",
+ "16 Chem ChemUNII_decreased_DiseaseSNOMEDCT \n",
+ "17 Chem ChemUNII_decreased_PhenoHP \n",
+ "18 Chem ChemUNII_decreased_PhenoNCIT \n",
+ "19 Chem ChemUNII_decreased_PhenoSNOMEDCT \n",
+ "20 Chem ChemUNII_decreased_ProcedureNCIT \n",
+ "21 Chem ChemUNII_increased_DiseaseMONDO \n",
+ "22 Chem ChemUNII_increased_DiseaseNCIT \n",
+ "23 Chem ChemUNII_increased_DiseaseSNOMEDCT \n",
+ "24 Chem ChemUNII_increased_PhenoHP \n",
+ "25 Chem ChemUNII_increased_PhenoNCIT \n",
+ "26 Chem ChemUNII_increased_PhenoSNOMEDCT \n",
+ "27 Chem ChemUNII_increased_ProcedureNCIT \n",
+ "28 Disease DiseaseMONDO_decreased_DiseaseMONDO \n",
+ "29 Disease DiseaseMONDO_decreased_DiseaseNCIT \n",
+ "30 Disease DiseaseMONDO_decreased_DiseaseSNOMEDCT \n",
+ "31 Disease DiseaseMONDO_decreased_PhenoHP \n",
+ "32 Disease DiseaseMONDO_decreased_PhenoNCIT \n",
+ "33 Disease DiseaseMONDO_decreased_PhenoSNOMEDCT \n",
+ "34 Disease DiseaseMONDO_decreased_ProcedureNCIT \n",
+ "35 Disease DiseaseMONDO_increased_DiseaseMONDO \n",
+ "36 Disease DiseaseMONDO_increased_DiseaseNCIT \n",
+ "37 Disease DiseaseMONDO_increased_DiseaseSNOMEDCT \n",
+ "38 Disease DiseaseMONDO_increased_PhenoHP \n",
+ "39 Disease DiseaseMONDO_increased_PhenoNCIT \n",
+ "40 Disease DiseaseMONDO_increased_PhenoSNOMEDCT \n",
+ "41 Disease DiseaseMONDO_increased_ProcedureNCIT \n",
+ "42 Disease DiseaseNCIT_decreased_DiseaseMONDO \n",
+ "43 Disease DiseaseNCIT_decreased_DiseaseSNOMEDCT \n",
+ "44 Disease DiseaseNCIT_decreased_PhenoHP \n",
+ "45 Disease DiseaseNCIT_decreased_PhenoNCIT \n",
+ "46 Disease DiseaseNCIT_decreased_ProcedureNCIT \n",
+ "47 Disease DiseaseNCIT_increased_DiseaseMONDO \n",
+ "48 Disease DiseaseNCIT_increased_DiseaseSNOMEDCT \n",
+ "49 Disease DiseaseNCIT_increased_PhenoHP \n",
+ "50 Disease DiseaseNCIT_increased_PhenoNCIT \n",
+ "51 Disease DiseaseNCIT_increased_PhenoSNOMEDCT \n",
+ "52 Disease DiseaseSNOMEDCT_decreased_DiseaseMONDO \n",
+ "53 Disease DiseaseSNOMEDCT_decreased_DiseaseNCIT \n",
+ "54 Disease DiseaseSNOMEDCT_decreased_DiseaseSNOMEDCT \n",
+ "55 Disease DiseaseSNOMEDCT_decreased_PhenoHP \n",
+ "56 Disease DiseaseSNOMEDCT_decreased_PhenoNCIT \n",
+ "57 Disease DiseaseSNOMEDCT_decreased_PhenoSNOMEDCT \n",
+ "58 Disease DiseaseSNOMEDCT_decreased_ProcedureNCIT \n",
+ "59 Disease DiseaseSNOMEDCT_increased_DiseaseMONDO \n",
+ "60 Disease DiseaseSNOMEDCT_increased_DiseaseNCIT \n",
+ "61 Disease DiseaseSNOMEDCT_increased_DiseaseSNOMEDCT \n",
+ "62 Disease DiseaseSNOMEDCT_increased_PhenoHP \n",
+ "63 Disease DiseaseSNOMEDCT_increased_PhenoNCIT \n",
+ "64 Disease DiseaseSNOMEDCT_increased_PhenoSNOMEDCT \n",
+ "65 Disease DiseaseSNOMEDCT_increased_ProcedureNCIT \n",
+ "66 Pheno PhenoHP_decreased_DiseaseMONDO \n",
+ "67 Pheno PhenoHP_decreased_DiseaseNCIT \n",
+ "68 Pheno PhenoHP_decreased_DiseaseSNOMEDCT \n",
+ "69 Pheno PhenoHP_decreased_PhenoHP \n",
+ "70 Pheno PhenoHP_decreased_PhenoNCIT \n",
+ "71 Pheno PhenoHP_decreased_PhenoSNOMEDCT \n",
+ "72 Pheno PhenoHP_decreased_ProcedureNCIT \n",
+ "73 Pheno PhenoHP_increased_DiseaseMONDO \n",
+ "74 Pheno PhenoHP_increased_DiseaseNCIT \n",
+ "75 Pheno PhenoHP_increased_DiseaseSNOMEDCT \n",
+ "76 Pheno PhenoHP_increased_PhenoHP \n",
+ "77 Pheno PhenoHP_increased_PhenoNCIT \n",
+ "78 Pheno PhenoHP_increased_PhenoSNOMEDCT \n",
+ "79 Pheno PhenoHP_increased_ProcedureNCIT \n",
+ "80 Pheno PhenoNCIT_decreased_DiseaseMONDO \n",
+ "81 Pheno PhenoNCIT_decreased_DiseaseNCIT \n",
+ "82 Pheno PhenoNCIT_decreased_DiseaseSNOMEDCT \n",
+ "83 Pheno PhenoNCIT_decreased_PhenoHP \n",
+ "84 Pheno PhenoNCIT_decreased_PhenoNCIT \n",
+ "85 Pheno PhenoNCIT_decreased_PhenoSNOMEDCT \n",
+ "86 Pheno PhenoNCIT_decreased_ProcedureNCIT \n",
+ "87 Pheno PhenoNCIT_increased_DiseaseMONDO \n",
+ "88 Pheno PhenoNCIT_increased_DiseaseNCIT \n",
+ "89 Pheno PhenoNCIT_increased_DiseaseSNOMEDCT \n",
+ "90 Pheno PhenoNCIT_increased_PhenoHP \n",
+ "91 Pheno PhenoNCIT_increased_PhenoNCIT \n",
+ "92 Pheno PhenoNCIT_increased_PhenoSNOMEDCT \n",
+ "93 Pheno PhenoNCIT_increased_ProcedureNCIT \n",
+ "94 Pheno PhenoSNOMEDCT_decreased_DiseaseMONDO \n",
+ "95 Pheno PhenoSNOMEDCT_decreased_DiseaseNCIT \n",
+ "96 Pheno PhenoSNOMEDCT_decreased_DiseaseSNOMEDCT \n",
+ "97 Pheno PhenoSNOMEDCT_decreased_PhenoHP \n",
+ "98 Pheno PhenoSNOMEDCT_decreased_PhenoNCIT \n",
+ "99 Pheno PhenoSNOMEDCT_decreased_PhenoSNOMEDCT \n",
+ "100 Pheno PhenoSNOMEDCT_decreased_ProcedureNCIT \n",
+ "101 Pheno PhenoSNOMEDCT_increased_DiseaseMONDO \n",
+ "102 Pheno PhenoSNOMEDCT_increased_DiseaseNCIT \n",
+ "103 Pheno PhenoSNOMEDCT_increased_DiseaseSNOMEDCT \n",
+ "104 Pheno PhenoSNOMEDCT_increased_PhenoHP \n",
+ "105 Pheno PhenoSNOMEDCT_increased_PhenoNCIT \n",
+ "106 Pheno PhenoSNOMEDCT_increased_ProcedureNCIT \n",
+ "107 Procedure ProcedureNCIT_decreased_DiseaseMONDO \n",
+ "108 Procedure ProcedureNCIT_decreased_DiseaseNCIT \n",
+ "109 Procedure ProcedureNCIT_decreased_DiseaseSNOMEDCT \n",
+ "110 Procedure ProcedureNCIT_decreased_PhenoHP \n",
+ "111 Procedure ProcedureNCIT_decreased_PhenoNCIT \n",
+ "112 Procedure ProcedureNCIT_decreased_PhenoSNOMEDCT \n",
+ "113 Procedure ProcedureNCIT_decreased_ProcedureNCIT \n",
+ "114 Procedure ProcedureNCIT_increased_DiseaseMONDO \n",
+ "115 Procedure ProcedureNCIT_increased_DiseaseNCIT \n",
+ "116 Procedure ProcedureNCIT_increased_DiseaseSNOMEDCT \n",
+ "117 Procedure ProcedureNCIT_increased_PhenoHP \n",
+ "118 Procedure ProcedureNCIT_increased_PhenoNCIT \n",
+ "119 Procedure ProcedureNCIT_increased_PhenoSNOMEDCT \n",
+ "120 Procedure ProcedureNCIT_increased_ProcedureNCIT "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Build the list of meta-triples for this KP, shaped as closely as possible to what BTE expects (see yaml)\n",
+ "# X_BTE KNOWN OPERATIONS TO ADD\n",
+ "# unique combos of subject-prefix / subject-category / predicate / object-prefix / object-category \n",
+ "# (and qualifier-set, if applicable)\n",
+ "\n",
+ "# Use the grouped[\"joined_to_match_xbte\"] column made below to create the tags and x-bte-kgs-operations portions in ehr_risk_kp.yaml\n",
+ " \n",
+ "meta_triples = kg.copy()\n",
+ "\n",
+ "meta_triples[\"subject_prefix\"] = meta_triples[\"subject\"].str.split(\":\").str[0]\n",
+ "meta_triples[\"object_prefix\"] = meta_triples[\"object\"].str.split(\":\").str[0]\n",
+ "\n",
+ "meta_triples[\"subject_category_postfix\"] = meta_triples[\"subject_category\"].str.split(\":\").str[1]\n",
+ "meta_triples[\"object_category_postfix\"] = meta_triples[\"object_category\"].str.split(\":\").str[1]\n",
+ "\n",
+ "meta_triples['increased_or_decreased'] = None\n",
+ "meta_triples.loc[meta_triples['predicate'].str.contains('increased', case=False, na=False), 'increased_or_decreased'] = 'increased'\n",
+ "meta_triples.loc[meta_triples['predicate'].str.contains('decreased', case=False, na=False), 'increased_or_decreased'] = 'decreased'\n",
+ "\n",
+ "# unique combos of subject-prefix / subject-category / predicate / object-prefix / object-category\n",
+ "\n",
+ "# Define the columns you want to combine\n",
+ "columns_to_combine = ['subject_category_postfix', 'subject_prefix', 'increased_or_decreased', 'object_category_postfix', 'object_prefix']\n",
+ "\n",
+ "# Group by the combination of columns and count the occurrences\n",
+ "grouped = meta_triples.groupby(columns_to_combine).size().reset_index(name='Count')\n",
+ "grouped['object_category_postfix_abbv'] = grouped['object_category_postfix'].str.replace('PhenotypicFeature', 'Pheno')\n",
+ "grouped['subject_category_postfix_abbv'] = grouped['subject_category_postfix'].str.replace('PhenotypicFeature', 'Pheno')\n",
+ "\n",
+ "grouped['object_category_postfix_abbv'] = grouped['object_category_postfix_abbv'].str.replace('ChemicalEntity', 'Chem')\n",
+ "grouped['subject_category_postfix_abbv'] = grouped['subject_category_postfix_abbv'].str.replace('ChemicalEntity', 'Chem')\n",
+ "\n",
+ "grouped['joined_to_match_xbte'] = grouped['subject_category_postfix_abbv'].astype(str) + grouped['subject_prefix'].astype(str) + '_' + grouped['increased_or_decreased'].astype(str) + '_' + grouped['object_category_postfix_abbv'].astype(str) + grouped['object_prefix'].astype(str)\n",
+ "\n",
+ "# grouped.to_csv('yaml_all_metatriples.tsv', sep=\"\\t\")\n",
+ "# Display the DataFrame of unique combinations and their counts\n",
+ "\n",
+ "with pd.option_context(\"display.max_rows\", 30000):\n",
+ " display(grouped)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 335,
+ "id": "ac11f711",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " predicate | \n",
+ " object | \n",
+ " relation | \n",
+ " provided_by | \n",
+ " provided_date | \n",
+ " predicate_category | \n",
+ " classifier | \n",
+ " auc_roc | \n",
+ " p_value | \n",
+ " ... | \n",
+ " num_patients_without_condition | \n",
+ " nodes_frozenset | \n",
+ " subject_id | \n",
+ " subject_name | \n",
+ " subject_category | \n",
+ " object_id | \n",
+ " object_name | \n",
+ " object_category | \n",
+ " subject_prefix | \n",
+ " object_prefix | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " HP:0008629 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 9999902 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " MONDO:0010643 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.999872 | \n",
+ " ... | \n",
+ " 10000835 | \n",
+ " (HP:0000360, MONDO:0010643) | \n",
+ " MONDO:0010643 | \n",
+ " acute leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " MONDO | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " UNII:25ADE2236L | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.936767 | \n",
+ " ... | \n",
+ " 10000939 | \n",
+ " (UNII:25ADE2236L, HP:0000360) | \n",
+ " UNII:25ADE2236L | \n",
+ " thrombin | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " UNII | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.998563 | \n",
+ " ... | \n",
+ " 9998659 | \n",
+ " (HP:0000360, UNII:K16AIQ8CTM) | \n",
+ " UNII:K16AIQ8CTM | \n",
+ " pertuzumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " UNII | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " MONDO:0007972 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0000360 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.840132 | \n",
+ " 0.093959 | \n",
+ " ... | \n",
+ " 9998750 | \n",
+ " (MONDO:0007972, HP:0000360) | \n",
+ " MONDO:0007972 | \n",
+ " Meniere disease | \n",
+ " biolink:Disease | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " MONDO | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 237099 | \n",
+ " CHEBI:114785 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998503 | \n",
+ " ... | \n",
+ " 10007299 | \n",
+ " (HP:0008629, CHEBI:114785) | \n",
+ " CHEBI:114785 | \n",
+ " erlotinib | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " CHEBI | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 237100 | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.999719 | \n",
+ " ... | \n",
+ " 9996273 | \n",
+ " (UNII:52CMI0WC3Y, HP:0008629) | \n",
+ " UNII:52CMI0WC3Y | \n",
+ " atezolizumab | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " UNII | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 237101 | \n",
+ " CHEBI:135738 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.998357 | \n",
+ " ... | \n",
+ " 9998030 | \n",
+ " (HP:0008629, CHEBI:135738) | \n",
+ " CHEBI:135738 | \n",
+ " clevidipine | \n",
+ " biolink:ChemicalEntity | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " CHEBI | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 237102 | \n",
+ " MONDO:0004967 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.997631 | \n",
+ " ... | \n",
+ " 10001385 | \n",
+ " (HP:0008629, MONDO:0004967) | \n",
+ " MONDO:0004967 | \n",
+ " acute lymphoblastic leukemia (disease) | \n",
+ " biolink:Disease | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " MONDO | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ " 237103 | \n",
+ " HP:0000360 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " HP:0008629 | \n",
+ " RO:0003308 | \n",
+ " EHR Risk Provider (Multiomics) | \n",
+ " 2022-05-18 | \n",
+ " biolink:Association | \n",
+ " Logistic Regression | \n",
+ " 0.959791 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 9997731 | \n",
+ " (HP:0000360, HP:0008629) | \n",
+ " HP:0000360 | \n",
+ " Tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP:0008629 | \n",
+ " Pulsatile tinnitus | \n",
+ " biolink:PhenotypicFeature | \n",
+ " HP | \n",
+ " HP | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
236812 rows × 23 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject predicate \\\n",
+ "0 HP:0008629 biolink:associated_with_increased_likelihood_of \n",
+ "1 MONDO:0010643 biolink:associated_with_increased_likelihood_of \n",
+ "2 UNII:25ADE2236L biolink:associated_with_increased_likelihood_of \n",
+ "3 UNII:K16AIQ8CTM biolink:associated_with_increased_likelihood_of \n",
+ "4 MONDO:0007972 biolink:associated_with_increased_likelihood_of \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 biolink:associated_with_decreased_likelihood_of \n",
+ "237100 UNII:52CMI0WC3Y biolink:associated_with_decreased_likelihood_of \n",
+ "237101 CHEBI:135738 biolink:associated_with_decreased_likelihood_of \n",
+ "237102 MONDO:0004967 biolink:associated_with_decreased_likelihood_of \n",
+ "237103 HP:0000360 biolink:associated_with_increased_likelihood_of \n",
+ "\n",
+ " object relation provided_by provided_date \\\n",
+ "0 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "1 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "2 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "3 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "4 HP:0000360 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "... ... ... ... ... \n",
+ "237099 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237100 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237101 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237102 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "237103 HP:0008629 RO:0003308 EHR Risk Provider (Multiomics) 2022-05-18 \n",
+ "\n",
+ " predicate_category classifier auc_roc p_value ... \\\n",
+ "0 biolink:Association Logistic Regression 0.840132 0.000000 ... \n",
+ "1 biolink:Association Logistic Regression 0.840132 0.999872 ... \n",
+ "2 biolink:Association Logistic Regression 0.840132 0.936767 ... \n",
+ "3 biolink:Association Logistic Regression 0.840132 0.998563 ... \n",
+ "4 biolink:Association Logistic Regression 0.840132 0.093959 ... \n",
+ "... ... ... ... ... ... \n",
+ "237099 biolink:Association Logistic Regression 0.959791 0.998503 ... \n",
+ "237100 biolink:Association Logistic Regression 0.959791 0.999719 ... \n",
+ "237101 biolink:Association Logistic Regression 0.959791 0.998357 ... \n",
+ "237102 biolink:Association Logistic Regression 0.959791 0.997631 ... \n",
+ "237103 biolink:Association Logistic Regression 0.959791 0.000000 ... \n",
+ "\n",
+ " num_patients_without_condition nodes_frozenset \\\n",
+ "0 9999902 (HP:0000360, HP:0008629) \n",
+ "1 10000835 (HP:0000360, MONDO:0010643) \n",
+ "2 10000939 (UNII:25ADE2236L, HP:0000360) \n",
+ "3 9998659 (HP:0000360, UNII:K16AIQ8CTM) \n",
+ "4 9998750 (MONDO:0007972, HP:0000360) \n",
+ "... ... ... \n",
+ "237099 10007299 (HP:0008629, CHEBI:114785) \n",
+ "237100 9996273 (UNII:52CMI0WC3Y, HP:0008629) \n",
+ "237101 9998030 (HP:0008629, CHEBI:135738) \n",
+ "237102 10001385 (HP:0008629, MONDO:0004967) \n",
+ "237103 9997731 (HP:0000360, HP:0008629) \n",
+ "\n",
+ " subject_id subject_name \\\n",
+ "0 HP:0008629 Pulsatile tinnitus \n",
+ "1 MONDO:0010643 acute leukemia (disease) \n",
+ "2 UNII:25ADE2236L thrombin \n",
+ "3 UNII:K16AIQ8CTM pertuzumab \n",
+ "4 MONDO:0007972 Meniere disease \n",
+ "... ... ... \n",
+ "237099 CHEBI:114785 erlotinib \n",
+ "237100 UNII:52CMI0WC3Y atezolizumab \n",
+ "237101 CHEBI:135738 clevidipine \n",
+ "237102 MONDO:0004967 acute lymphoblastic leukemia (disease) \n",
+ "237103 HP:0000360 Tinnitus \n",
+ "\n",
+ " subject_category object_id object_name \\\n",
+ "0 biolink:PhenotypicFeature HP:0000360 Tinnitus \n",
+ "1 biolink:Disease HP:0000360 Tinnitus \n",
+ "2 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "3 biolink:ChemicalEntity HP:0000360 Tinnitus \n",
+ "4 biolink:Disease HP:0000360 Tinnitus \n",
+ "... ... ... ... \n",
+ "237099 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237100 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237101 biolink:ChemicalEntity HP:0008629 Pulsatile tinnitus \n",
+ "237102 biolink:Disease HP:0008629 Pulsatile tinnitus \n",
+ "237103 biolink:PhenotypicFeature HP:0008629 Pulsatile tinnitus \n",
+ "\n",
+ " object_category subject_prefix object_prefix \n",
+ "0 biolink:PhenotypicFeature HP HP \n",
+ "1 biolink:PhenotypicFeature MONDO HP \n",
+ "2 biolink:PhenotypicFeature UNII HP \n",
+ "3 biolink:PhenotypicFeature UNII HP \n",
+ "4 biolink:PhenotypicFeature MONDO HP \n",
+ "... ... ... ... \n",
+ "237099 biolink:PhenotypicFeature CHEBI HP \n",
+ "237100 biolink:PhenotypicFeature UNII HP \n",
+ "237101 biolink:PhenotypicFeature CHEBI HP \n",
+ "237102 biolink:PhenotypicFeature MONDO HP \n",
+ "237103 biolink:PhenotypicFeature HP HP \n",
+ "\n",
+ "[236812 rows x 23 columns]"
+ ]
+ },
+ "execution_count": 335,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# need this dataframe (kg_yaml) to look up examples for the testExamples (qInput and oneOutput) portion of the yaml\n",
+ "# additionally need it to generate the x-bte-response-mapping portion\n",
+ "\n",
+ "kg_yaml = kg.copy()\n",
+ "\n",
+ "kg_yaml[\"subject_prefix\"] = kg_yaml[\"subject\"].str.split(\":\").str[0]\n",
+ "kg_yaml[\"object_prefix\"] = kg_yaml[\"object\"].str.split(\":\").str[0]\n",
+ "\n",
+ "kg_yaml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 339,
+ "id": "f5c776cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the following code writes the tags and x-bte-kgs-operations of ehr_risk_kg.yaml to an output file titled 'x-bte-kgs-operations-response-mapping.txt'\n",
+ "# copy paste the output to the relevant section in the yaml\n",
+ "with open('x-bte-kgs-operations-response-mapping.txt', 'w+') as file:\n",
+ " file.write(\" tags:\" + \"\\n\")\n",
+ " file.write(\" - query:\" + \"\\n\")\n",
+ " file.write(f\" ## {len(grouped)} operations based on TSV of KG\" + \"\\n\")\n",
+ " for index, row in grouped.iterrows():\n",
+ " file.write(f\" ## - {row['joined_to_match_xbte']}\" + \"\\n\")\n",
+ " file.write(\" x-bte-kgs-operations:\" + \"\\n\")\n",
+ " for index, row in grouped.iterrows(): \n",
+ " file.write(f\" - $ref: '#/components/x-bte-kgs-operations/{row['joined_to_match_xbte']}'\" + \"\\n\")\n",
+ " file.write(f\" - $ref: '#/components/x-bte-kgs-operations/{row['joined_to_match_xbte']}-rev'\" + \"\\n\")\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 340,
+ "id": "232ad5c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the following code writes the x-bte-kgs-operations of ehr_risk_kg.yaml to an output file titled 'x-bte-kgs-operations-response-mapping.txt'\n",
+ "# copy paste the output to the relevant section in the yaml\n",
+ "\n",
+ "with open('x-bte-kgs-operations-response-mapping.txt', 'a') as file:\n",
+ " file.write(\" \" + \"x-bte-kgs-operations:\" + \"\\n\")\n",
+ " for index, row in grouped.iterrows():\n",
+ " xbte_header = f'{row[\"joined_to_match_xbte\"]}:'\n",
+ " file.write(\" \" + xbte_header + \"\\n\")\n",
+ " \n",
+ " base_url = \"https://pending.biothings.io/multiomics_ehr_risk_kp/query?q=\"\n",
+ " url = base_url + f'''subject.type:\"biolink:{row['subject_category_postfix']}\"%20AND%20_exists_:subject.{row['subject_prefix']}%20AND%20association.predicate:associated_with_{row['increased_or_decreased']}_likelihood_of%20AND%20object.type:\"biolink:{row['object_category_postfix']}\"%20AND%20_exists_:object.{row['object_prefix']}'''\n",
+ " commented_url = \"## \" + url\n",
+ " file.write(\" \" + commented_url + \"\\n\")\n",
+ " \n",
+ " file.write(\" ## \" + str(row[\"Count\"]) + \" records\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" - supportBatch: true\" + \"\\n\")\n",
+ " file.write(\" useTemplating: true\" + \"\\n\")\n",
+ " file.write(\" inputs:\" + \"\\n\")\n",
+ " \n",
+ " file.write(f\" - id: {row['subject_prefix']}\" + \"\\n\")\n",
+ " file.write(f\"   semantic: {row['subject_category_postfix']}\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" requestBody:\" + \"\\n\")\n",
+ " file.write(\" body: >-\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" {\" + \"\\n\")\n",
+ " q_string = f\"\"\" \"q\": [ {{{{ queryInputs | rmPrefix() | wrap ( '[\"' , '\",\"biolink:{row['subject_category_postfix']}\", \"associated_with_{row['increased_or_decreased']}_likelihood_of\", \"biolink:{row['object_category_postfix']}\"]') }}}} ],\"\"\"\n",
+ " file.write(' ' + q_string + \"\\n\") \n",
+ " scopes_string = f\"\"\" \"scopes\": [\"subject.{row['subject_prefix']}\", \"subject.type\", \"association.predicate\", \"object.type\"]\"\"\"\n",
+ " file.write(' ' + scopes_string + \"\\n\") \n",
+ " file.write(\" }\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" parameters:\" + \"\\n\")\n",
+ " file.write(f\" fields: object.{row['object_prefix']}, association.edge_attributes, subject.name, object.name, source.edge_sources\" + \"\\n\")\n",
+ " file.write(f\" size: 1000\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" outputs:\" + \"\\n\")\n",
+ " file.write(f\" - id: {row['object_prefix']}\" + \"\\n\")\n",
+ " file.write(f\" semantic: {row['object_category_postfix']}\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" predicate: associated_with\" + \"\\n\")\n",
+ " file.write(\" qualifiers:\" + \"\\n\")\n",
+ " file.write(f\" object_direction_qualifier: {row['increased_or_decreased']}\" + \"\\n\")\n",
+ " file.write(f\" object_aspect_qualifier: likelihood\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" response_mapping:\" + \"\\n\")\n",
+ " file.write(f' \"$ref\": \"#/components/x-bte-response-mapping/object-{row[\"object_prefix\"]}\"' + \"\\n\")\n",
+ " \n",
+ " file.write(\" # testExamples:\" + \"\\n\")\n",
+ " sub_prefix = row[\"subject_prefix\"]\n",
+ " obj_prefix = row[\"object_prefix\"]\n",
+ " matched_rows = kg_yaml.loc[(kg_yaml['subject_prefix'] == sub_prefix) & (kg_yaml['object_prefix'] == obj_prefix)]\n",
+ " select_row = matched_rows.sample()\n",
+ " qInput = select_row['subject'].values[0]\n",
+ " qName = select_row['subject_name'].values[0]\n",
+ " oneOutput = select_row['object'].values[0]\n",
+ " oneOutputName = select_row['object_name'].values[0]\n",
+ " file.write(f' # - qInput: \"{qInput}\" ## {qName} ' + \"\\n\")\n",
+ " file.write(f' # oneOutput: \"{oneOutput}\" ## {oneOutputName} ' + \"\\n\")\n",
+ " \n",
+ " #### DO REVERSE #### \n",
+ " \n",
+ " xbte_header_rev = f'{row[\"joined_to_match_xbte\"]}-rev:'\n",
+ " file.write(\" \" + xbte_header_rev + \"\\n\")\n",
+ "\n",
+ " file.write(\" - supportBatch: true\" + \"\\n\")\n",
+ " file.write(\" useTemplating: true\" + \"\\n\")\n",
+ " file.write(\" inputs:\" + \"\\n\")\n",
+ " \n",
+ " file.write(f\" - id: {row['object_prefix']}\" + \"\\n\")\n",
+ " file.write(f\"   semantic: {row['object_category_postfix']}\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" requestBody:\" + \"\\n\")\n",
+ " file.write(\" body: >-\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" {\" + \"\\n\")\n",
+ " q_string = f\"\"\" \"q\": [ {{{{ queryInputs | rmPrefix() | wrap ( '[\"' , '\",\"biolink:{row['subject_category_postfix']}\", \"associated_with_{row['increased_or_decreased']}_likelihood_of\", \"biolink:{row['object_category_postfix']}\"]') }}}} ],\"\"\"\n",
+ " file.write(' ' + q_string + \"\\n\") \n",
+ " scopes_string = f\"\"\" \"scopes\": [\"object.{row['object_prefix']}\", \"subject.type\", \"association.predicate\", \"object.type\"]\"\"\"\n",
+ " file.write(' ' + scopes_string + \"\\n\") \n",
+ " file.write(\" }\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" parameters:\" + \"\\n\")\n",
+ " file.write(f\" fields: subject.{row['subject_prefix']}, association.edge_attributes, subject.name, object.name, source.edge_sources\" + \"\\n\")\n",
+ " file.write(f\" size: 1000\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" outputs:\" + \"\\n\")\n",
+ " file.write(f\" - id: {row['subject_prefix']}\" + \"\\n\")\n",
+ " file.write(f\" semantic: {row['subject_category_postfix']}\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" predicate: associated_with\" + \"\\n\")\n",
+ " file.write(\" qualifiers:\" + \"\\n\")\n",
+ " file.write(f\" subject_direction_qualifier: {row['increased_or_decreased']}\" + \"\\n\")\n",
+ " file.write(f\" subject_aspect_qualifier: likelihood\" + \"\\n\")\n",
+ " \n",
+ " file.write(\" response_mapping:\" + \"\\n\")\n",
+ " file.write(f' \"$ref\": \"#/components/x-bte-response-mapping/subject-{row[\"subject_prefix\"]}\"' + \"\\n\")\n",
+ " \n",
+ " file.write(\" # testExamples:\" + \"\\n\")\n",
+ " sub_prefix = row[\"subject_prefix\"]\n",
+ " obj_prefix = row[\"object_prefix\"]\n",
+ " matched_rows = kg_yaml.loc[(kg_yaml['subject_prefix'] == sub_prefix) & (kg_yaml['object_prefix'] == obj_prefix)]\n",
+ " select_row = matched_rows.sample()\n",
+ " qInput = select_row['object'].values[0]\n",
+ " qName = select_row['object_name'].values[0]\n",
+ " oneOutput = select_row['subject'].values[0]\n",
+ " oneOutputName = select_row['subject_name'].values[0]\n",
+ " file.write(f' # - qInput: \"{qInput}\" ## {qName} ' + \"\\n\")\n",
+ " file.write(f' # oneOutput: \"{oneOutput}\" ## {oneOutputName} ' + \"\\n\")\n",
+ " \n",
+ " \n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9bbc605f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Output example for x-bte-kgs-operations\n",
+ "\n",
+ " ChemCHEBI_decreased_PhenoNCIT:\n",
+ " ## https://pending.biothings.io/multiomics_ehr_risk_kp/query?q=subject.type:ChemicalEntity%20AND%20_exists_:subject.CHEBI%20AND%20association.predicate:associated_with_decreased_likelihood_of%20AND%20object.type:PhenotypicFeature%20AND%20_exists_:object.NCIT\n",
+ " ## 191 records\n",
+ " - supportBatch: true\n",
+ " useTemplating: true \n",
+ " inputs:\n",
+ " - id: CHEBI\n",
+ " semantic: SmallMolecule\n",
+ " requestBodyType: object\n",
+ " requestBody:\n",
+ " body: >-\n",
+ " {\n",
+ " \"q\": [ {{ queryInputs | rmPrefix() | wrap ( '[\"' , '\",\"biolink:ChemicalEntity\", \"associated_with_decreased_likelihood_of\", \"biolink:PhenotypicFeature\"]') }} ],\n",
+ " \"scopes\": [\"subject.CHEBI\", \"subject.type\", \"association.predicate\", \"object.type\"]\n",
+ " }\n",
+ " parameters:\n",
+ " fields: >-\n",
+ " object.NCIT,\n",
+ " association.edge_attributes,\n",
+ " subject.name,object.name\n",
+ " size: 1000\n",
+ " outputs:\n",
+ " - id: NCIT\n",
+ " semantic: PhenotypicFeature\n",
+ " predicate: associated_with\n",
+ " qualifiers:\n",
+ " object_direction_qualifier: decreased\n",
+ " object_aspect_qualifier: likelihood\n",
+ " response_mapping:\n",
+ " \"$ref\": \"#/components/x-bte-response-mapping/object-NCIT\"\n",
+ " # testExamples:\n",
+ " # - qInput: \"CHEBI:135866\" ## clindamycin\n",
+ " # oneOutput: \"NCIT:C171647\" ## SARS Coronavirus 2 Positive\n",
+ " ChemCHEBI_decreased_PhenoNCIT-rev:\n",
+ " - supportBatch: true\n",
+ " useTemplating: true \n",
+ " inputs:\n",
+ " - id: NCIT\n",
+ " semantic: PhenotypicFeature\n",
+ " requestBodyType: object\n",
+ " requestBody:\n",
+ " body: >-\n",
+ " {\n",
+ " \"q\": [ {{ queryInputs | rmPrefix() | wrap( '[\"' , '\",\"biolink:ChemicalEntity\", \"associated_with_decreased_likelihood_of\", \"biolink:PhenotypicFeature\"]') }} ],\n",
+ " \"scopes\": [\"object.NCIT\", \"subject.type\", \"association.predicate\", \"object.type\"]\n",
+ " }\n",
+ " parameters:\n",
+ " fields: >-\n",
+ " subject.CHEBI,\n",
+ " association.edge_attributes,\n",
+ " subject.name,object.name\n",
+ " size: 1000\n",
+ " outputs:\n",
+ " - id: CHEBI\n",
+ " semantic: SmallMolecule\n",
+ " predicate: associated_with\n",
+ " qualifiers:\n",
+ " subject_direction_qualifier: decreased\n",
+ " subject_aspect_qualifier: likelihood\n",
+ " response_mapping:\n",
+ " \"$ref\": \"#/components/x-bte-response-mapping/subject-CHEBI\"\n",
+ " # testExamples:\n",
+ " # - qInput: \"NCIT:C171647\" ## SARS Coronavirus 2 Positive\n",
+ " # oneOutput: \"CHEBI:119915\" ## fentanyl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 341,
+ "id": "d834fe3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the following code writes the x-bte-response-mapping of ehr_risk_kg.yaml to an output file titled 'x-bte-kgs-operations-response-mapping.txt'\n",
+ "# copy paste the output to the relevant section in the yaml\n",
+ "with open('x-bte-kgs-operations-response-mapping.txt', 'a') as file:\n",
+ " file.write(\" \" + \"x-bte-response-mapping:\" + \"\\n\")\n",
+ " for sprefix in list(kg_yaml['subject_prefix'].unique()):\n",
+ " file.write(f\" subject-{sprefix}:\" + \"\\n\")\n",
+ " file.write(f\" {sprefix}: subject.{sprefix}\" + \"\\n\")\n",
+ " file.write(f\" input_name: subject.name\" + \"\\n\")\n",
+ " file.write(f\" output_name: object.name\" + \"\\n\")\n",
+ " file.write(f\" edge-attributes: association.edge_attributes\" + \"\\n\")\n",
+ " file.write(f\" trapi_sources: source.edge_sources\" + \"\\n\")\n",
+ " for oprefix in list(kg_yaml['object_prefix'].unique()):\n",
+ " file.write(f\" object-{oprefix}:\" + \"\\n\")\n",
+ " file.write(f\" {oprefix}: object.{oprefix}\" + \"\\n\")\n",
+ " file.write(f\" input_name: object.name\" + \"\\n\")\n",
+ " file.write(f\" output_name: subject.name\" + \"\\n\")\n",
+ " file.write(f\" edge-attributes: association.edge_attributes\" + \"\\n\")\n",
+ " file.write(f\" trapi_sources: source.edge_sources\" + \"\\n\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 325,
+ "id": "30781582",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject | \n",
+ " object | \n",
+ " predicate | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 111072 | \n",
+ " HP:0001974 | \n",
+ " NCIT:C167118 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 211014 | \n",
+ " NCIT:C167118 | \n",
+ " HP:0001974 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 117058 | \n",
+ " HP:0002900 | \n",
+ " NCIT:C167118 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 122579 | \n",
+ " HP:0003124 | \n",
+ " NCIT:C167118 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 117810 | \n",
+ " HP:0002902 | \n",
+ " NCIT:C167118 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 78133 | \n",
+ " CHEBI:77431 | \n",
+ " MONDO:0004721 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 78134 | \n",
+ " CHEBI:77431 | \n",
+ " MONDO:0004866 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 78135 | \n",
+ " CHEBI:77431 | \n",
+ " MONDO:0004868 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 78136 | \n",
+ " CHEBI:77431 | \n",
+ " MONDO:0004946 | \n",
+ " biolink:associated_with_increased_likelihood_of | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 234248 | \n",
+ " UNII:X85G7936GV | \n",
+ " SNOMEDCT:76571007 | \n",
+ " biolink:associated_with_decreased_likelihood_of | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
234249 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject object \\\n",
+ "111072 HP:0001974 NCIT:C167118 \n",
+ "211014 NCIT:C167118 HP:0001974 \n",
+ "117058 HP:0002900 NCIT:C167118 \n",
+ "122579 HP:0003124 NCIT:C167118 \n",
+ "117810 HP:0002902 NCIT:C167118 \n",
+ "... ... ... \n",
+ "78133 CHEBI:77431 MONDO:0004721 \n",
+ "78134 CHEBI:77431 MONDO:0004866 \n",
+ "78135 CHEBI:77431 MONDO:0004868 \n",
+ "78136 CHEBI:77431 MONDO:0004946 \n",
+ "234248 UNII:X85G7936GV SNOMEDCT:76571007 \n",
+ "\n",
+ " predicate count \n",
+ "111072 biolink:associated_with_decreased_likelihood_of 4 \n",
+ "211014 biolink:associated_with_decreased_likelihood_of 4 \n",
+ "117058 biolink:associated_with_increased_likelihood_of 4 \n",
+ "122579 biolink:associated_with_decreased_likelihood_of 4 \n",
+ "117810 biolink:associated_with_increased_likelihood_of 3 \n",
+ "... ... ... \n",
+ "78133 biolink:associated_with_decreased_likelihood_of 1 \n",
+ "78134 biolink:associated_with_decreased_likelihood_of 1 \n",
+ "78135 biolink:associated_with_increased_likelihood_of 1 \n",
+ "78136 biolink:associated_with_increased_likelihood_of 1 \n",
+ "234248 biolink:associated_with_decreased_likelihood_of 1 \n",
+ "\n",
+ "[234249 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 325,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# # check to see how many identical triples there are in KG (requested by BTE)\n",
+ "# identical_triples_check = kg.groupby(['subject','object', 'predicate']).size().reset_index().rename(columns={0:'count'}).sort_values(by=['count'], ascending=False)\n",
+ "# identical_triples_check"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "647beb4b",
+ "metadata": {},
+ "source": [
+ "# Make a cleaned version of the parser below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "9a68e6a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " \"_id\": \"HP:0008629_HP:0000360_0.8401321539277617_00_8796399245685702_10102731\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0008629\",\n",
+ " \"id\": \"HP:0008629\",\n",
+ " \"name\": \"Pulsatile tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 8.796399245685702,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10102731,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0010643_HP:0000360_0.8401321539277617_09998721067797812_8585212287149526_10107468\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0010643\",\n",
+ " \"id\": \"MONDO:0010643\",\n",
+ " \"name\": \"acute leukemia (disease)\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9998721067797812,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 8.585212287149526,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10107468,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"UNII:25ADE2236L_HP:0000360_0.8401321539277617_09367666401584368_4558176672832635_10095297\",\n",
+ " \"subject\": {\n",
+ " \"UNII\": \"25ADE2236L\",\n",
+ " \"id\": \"UNII:25ADE2236L\",\n",
+ " \"name\": \"thrombin\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9367666401584368,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 4.558176672832635,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10095297,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"UNII:K16AIQ8CTM_HP:0000360_0.8401321539277617_09985626800193924_43575215395209606_10099409\",\n",
+ " \"subject\": {\n",
+ " \"UNII\": \"K16AIQ8CTM\",\n",
+ " \"id\": \"UNII:K16AIQ8CTM\",\n",
+ " \"name\": \"pertuzumab\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9985626800193924,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 4.3575215395209606,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099409,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0007972_HP:0000360_0.8401321539277617_009395878968875304_392606416950393_10100235\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0007972\",\n",
+ " \"id\": \"MONDO:0007972\",\n",
+ " \"name\": \"Meniere disease\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.09395878968875304,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 3.92606416950393,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10100235,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0004866_HP:0000360_0.8401321539277617_00_30228399397470613_10094256\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0004866\",\n",
+ " \"id\": \"MONDO:0004866\",\n",
+ " \"name\": \"eustachian tube disease\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 3.0228399397470613,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10094256,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"HP:0002321_HP:0000360_0.8401321539277617_00_297792187563902_10105748\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0002321\",\n",
+ " \"id\": \"HP:0002321\",\n",
+ " \"name\": \"Vertigo\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.97792187563902,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10105748,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"HP:0011897_HP:0000360_0.8401321539277617_09882157516627652_29664217619814317_10103231\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0011897\",\n",
+ " \"id\": \"HP:0011897\",\n",
+ " \"name\": \"Neutrophilia\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9882157516627652,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.9664217619814317,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10103231,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:3403_HP:0000360_0.8401321539277617_07343407676622777_26320323984271305_10094497\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"3403\",\n",
+ " \"id\": \"CHEBI:3403\",\n",
+ " \"name\": \"carboprost\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.7343407676622777,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.6320323984271305,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10094497,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:42758_HP:0000360_0.8401321539277617_041935921163506107_25903432895291627_10090857\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"42758\",\n",
+ " \"id\": \"CHEBI:42758\",\n",
+ " \"name\": \"dextrose\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.41935921163506107,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.5903432895291627,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10090857,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"HP:0030788_HP:0000360_0.8401321539277617_00_2533460251458441_10098414\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0030788\",\n",
+ " \"id\": \"HP:0030788\",\n",
+ " \"name\": \"Impacted cerumen\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.533460251458441,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10098414,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0012883_HP:0000360_0.8401321539277617_099451727004904_23595369689943997_10098573\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0012883\",\n",
+ " \"id\": \"MONDO:0012883\",\n",
+ " \"name\": \"acute promyelocytic leukemia\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.99451727004904,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.3595369689943997,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10098573,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:15882_HP:0000360_0.8401321539277617_09962946928940472_21934060641992064_10099906\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"15882\",\n",
+ " \"id\": \"CHEBI:15882\",\n",
+ " \"name\": \"phenol\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9962946928940472,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 2.1934060641992064,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099906,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0019065_HP:0000360_0.8401321539277617_09965366860113084_1909472317873171_10094912\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0019065\",\n",
+ " \"id\": \"MONDO:0019065\",\n",
+ " \"name\": \"amyloidosis (disease)\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9965366860113084,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.909472317873171,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10094912,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"UNII:1RXS4UE564_HP:0000360_0.8401321539277617_09145182816982448_17682389618102223_10099640\",\n",
+ " \"subject\": {\n",
+ " \"UNII\": \"1RXS4UE564\",\n",
+ " \"id\": \"UNII:1RXS4UE564\",\n",
+ " \"name\": \"alteplase\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9145182816982448,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.7682389618102223,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099640,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:2637_HP:0000360_0.8401321539277617_09984753422140926_17273309005514552_10097751\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"2637\",\n",
+ " \"id\": \"CHEBI:2637\",\n",
+ " \"name\": \"amikacin\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9984753422140926,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.7273309005514552,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10097751,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:28001_HP:0000360_0.8401321539277617_0327274960009581_17001729126903409_10099157\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"28001\",\n",
+ " \"id\": \"CHEBI:28001\",\n",
+ " \"name\": \"vancomycin\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.327274960009581,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.7001729126903409,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099157,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:32142_HP:0000360_0.8401321539277617_08551368959397363_16756681953392343_10100133\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"32142\",\n",
+ " \"id\": \"CHEBI:32142\",\n",
+ " \"name\": \"sodium citrate\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.8551368959397363,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.6756681953392343,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10100133,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:5280_HP:0000360_0.8401321539277617_09843217708630094_16666971285058136_10101665\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"5280\",\n",
+ " \"id\": \"CHEBI:5280\",\n",
+ " \"name\": \"gatifloxacin\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9843217708630094,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.6666971285058136,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10101665,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:31705_HP:0000360_0.8401321539277617_09099121721955136_16359973452904688_10097430\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"31705\",\n",
+ " \"id\": \"CHEBI:31705\",\n",
+ " \"name\": \"iodixanol\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9099121721955136,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.6359973452904688,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10097430,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"HP:0002315_HP:0000360_0.8401321539277617_00_16166497779176117_10105844\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0002315\",\n",
+ " \"id\": \"HP:0002315\",\n",
+ " \"name\": \"Headache\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.6166497779176117,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10105844,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:6709_HP:0000360_0.8401321539277617_9059419880941276e-14_1598480857062902_10100577\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"6709\",\n",
+ " \"id\": \"CHEBI:6709\",\n",
+ " \"name\": \"meclizine\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 9.059419880941276e-14,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.598480857062902,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10100577,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"UNII:1BJ477IO2L_HP:0000360_0.8401321539277617_00_15423681686921502_10101289\",\n",
+ " \"subject\": {\n",
+ " \"UNII\": \"1BJ477IO2L\",\n",
+ " \"id\": \"UNII:1BJ477IO2L\",\n",
+ " \"name\": \"gadobutrol\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.5423681686921502,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10101289,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:9421_HP:0000360_0.8401321539277617_08516479806885544_14807202351994573_10096948\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"9421\",\n",
+ " \"id\": \"CHEBI:9421\",\n",
+ " \"name\": \"tazobactam\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.8516479806885544,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.4807202351994573,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10096948,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:478164_HP:0000360_0.8401321539277617_07859646244357879_1454940814532088_10106536\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"478164\",\n",
+ " \"id\": \"CHEBI:478164\",\n",
+ " \"name\": \"cefepime\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.7859646244357879,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.454940814532088,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10106536,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:4462_HP:0000360_0.8401321539277617_08450305750308531_13343018079195257_10093475\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"4462\",\n",
+ " \"id\": \"CHEBI:4462\",\n",
+ " \"name\": \"sodium phosphate\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.8450305750308531,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.3343018079195257,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10093475,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"SNOMEDCT:127295002_HP:0000360_0.8401321539277617_05945757673388871_13294623195442723_10100698\",\n",
+ " \"subject\": {\n",
+ " \"SNOMEDCT\": \"127295002\",\n",
+ " \"id\": \"SNOMEDCT:127295002\",\n",
+ " \"name\": \"Traumatic brain injury (disorder)\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.5945757673388871,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.3294623195442723,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10100698,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0010030_HP:0000360_0.8401321539277617_08231940924433323_13284310019939671_10099754\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0010030\",\n",
+ " \"id\": \"MONDO:0010030\",\n",
+ " \"name\": \"Sjogren syndrome\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.8231940924433323,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.3284310019939671,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099754,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"UNII:FQ3DRG0N5K_HP:0000360_0.8401321539277617_09997741237977732_12685220499445722_10100132\",\n",
+ " \"subject\": {\n",
+ " \"UNII\": \"FQ3DRG0N5K\",\n",
+ " \"id\": \"UNII:FQ3DRG0N5K\",\n",
+ " \"name\": \"pancrelipase\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9997741237977732,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.2685220499445722,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10100132,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"HP:0032372_HP:0000360_0.8401321539277617_09739456841856716_12651078229598214_10095958\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0032372\",\n",
+ " \"id\": \"HP:0032372\",\n",
+ " \"name\": \"Increased peripheral blast count\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9739456841856716,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.2651078229598214,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10095958,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:6754_HP:0000360_0.8401321539277617_05082567984079447_11344123128490888_10097573\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"6754\",\n",
+ " \"id\": \"CHEBI:6754\",\n",
+ " \"name\": \"meperidine\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.5082567984079447,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.1344123128490888,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10097573,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0005391_HP:0000360_0.8401321539277617_019858432436236395_11125118199146522_10096019\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0005391\",\n",
+ " \"id\": \"MONDO:0005391\",\n",
+ " \"name\": \"restless legs syndrome\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.19858432436236395,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.1125118199146522,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10096019,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0018935_HP:0000360_0.8401321539277617_099536202601544_11117474952487418_10099681\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0018935\",\n",
+ " \"id\": \"MONDO:0018935\",\n",
+ " \"name\": \"hairy cell leukemia\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.99536202601544,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.1117474952487418,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099681,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0011786_HP:0000360_0.8401321539277617_00_11102871262689296_10101809\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0011786\",\n",
+ " \"id\": \"MONDO:0011786\",\n",
+ " \"name\": \"allergic rhinitis\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presence or absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between classes.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.1102871262689296,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10101809,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0002039_HP:0000360_0.8401321539277617_08357342064986439_10945588244442226_10102298\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0002039\",\n",
+ " \"id\": \"MONDO:0002039\",\n",
+ " \"name\": \"cognitive disorder\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.8357342064986439,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presence or absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between classes.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.0945588244442226,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10102298,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"HP:0001251_HP:0000360_0.8401321539277617_07111840915054395_10809198194248215_10099363\",\n",
+ " \"subject\": {\n",
+ " \"HP\": \"0001251\",\n",
+ " \"id\": \"HP:0001251\",\n",
+ " \"name\": \"Ataxia\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.7111840915054395,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presence or absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between classes.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.0809198194248215,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10099363,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:4031_HP:0000360_0.8401321539277617_08886902851131677_10571543050145833_10098612\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"4031\",\n",
+ " \"id\": \"CHEBI:4031\",\n",
+ " \"name\": \"cyclosporine\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.8886902851131677,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presence or absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between classes.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.0571543050145833,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10098612,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0001119_HP:0000360_0.8401321539277617_09257571836006121_1042593529879973_10100198\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0001119\",\n",
+ " \"id\": \"MONDO:0001119\",\n",
+ " \"name\": \"premature menopause\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.9257571836006121,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presence or absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between classes.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.042593529879973,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10100198,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"MONDO:0013600_HP:0000360_0.8401321539277617_00_1040152790331654_10101431\",\n",
+ " \"subject\": {\n",
+ " \"MONDO\": \"0013600\",\n",
+ " \"id\": \"MONDO:0013600\",\n",
+ " \"name\": \"insomnia (disease)\",\n",
+ " \"type\": \"biolink:Disease\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.0,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presence or absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between classes.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.040152790331654,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10101431,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "{\n",
+ " \"_id\": \"CHEBI:15407_HP:0000360_0.8401321539277617_0002238827241549002_10328062673251501_10092760\",\n",
+ " \"subject\": {\n",
+ " \"CHEBI\": \"15407\",\n",
+ " \"id\": \"CHEBI:15407\",\n",
+ " \"name\": \"ephedrine\",\n",
+ " \"type\": \"biolink:ChemicalEntity\"\n",
+ " },\n",
+ " \"association\": {\n",
+ " \"predicate\": \"associated_with_increased_likelihood_of\",\n",
+ " \"edge_attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:has_supporting_study_result\",\n",
+ " \"value\": \"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:update_date\",\n",
+ " \"value\": \"2022-05-18\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": 0.002238827241549002,\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": 0.8401321539277617,\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": 1.0328062673251501,\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_size\",\n",
+ " \"value\": 10092760,\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \"value\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_data_source\",\n",
+ " \"value\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\"\n",
+ " }\n",
+ " ]\n",
+ " },\n",
+ " \"object\": {\n",
+ " \"HP\": \"0000360\",\n",
+ " \"id\": \"HP:0000360\",\n",
+ " \"name\": \"Tinnitus\",\n",
+ " \"type\": \"biolink:PhenotypicFeature\"\n",
+ " },\n",
+ " \"source\": {\n",
+ " \"edge_sources\": [\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " },\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "Document IDs appear to be unique\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import json\n",
+ "import sys, os\n",
+ "import numpy as np\n",
+ "\n",
+ "def parse_ehr_risk(data_folder):\n",
+ "\n",
+ " edges_filename = \"ehr_risk_edges_data_2022_06_01.csv\"\n",
+ " nodes_filename = \"ehr_risk_nodes_data_2022_06_01.csv\"\n",
+ "\n",
+ " nodes_filepath = os.path.join(data_folder, nodes_filename)\n",
+ " edges_filepath = os.path.join(data_folder, edges_filename)\n",
+ " nodes_data = pd.read_csv(nodes_filepath, sep = ',')\n",
+ " edges_data = pd.read_csv(edges_filepath, sep = ',')\n",
+ " \n",
+ " # the nodes file has duplicate ids; fix in enclave in future\n",
+ " nodes_data = nodes_data.drop_duplicates(subset='id', keep=\"first\")\n",
+ " \n",
+ " # biolink category biolink:ChemicalSubstance has been deprecated. Use biolink:ChemicalEntity instead\n",
+ " nodes_data[\"category\"].mask(nodes_data[\"category\"] == \"biolink:ChemicalSubstance\", \"biolink:ChemicalEntity\" , inplace=True )\n",
+ "\n",
+ " # we originally provided the # of patients with condition --> log + patient count, and # of patients without condition --> log - patient count\n",
+ " # get the approximate total number of patients in the study and call it \"total_sample_size\"\n",
+ " edges_data[\"num_patients_with_condition\"] = 10**(edges_data['log_positive_patient_count']) # convert log pos patient count to an actual # \n",
+ " edges_data[\"num_patients_without_condition\"] = 10**(edges_data['log_negative_patient_count']) # convert log neg patient count to an actual #\n",
+ " edges_data = edges_data.drop(['log_positive_patient_count', 'log_negative_patient_count'], axis=1)\n",
+ " edges_data[\"total_sample_size\"] = edges_data[\"num_patients_with_condition\"] + edges_data[\"num_patients_without_condition\"]\n",
+ " edges_data = edges_data.drop(['num_patients_with_condition', 'num_patients_without_condition'], axis=1)\n",
+ " edges_data[\"total_sample_size\"] = np.random.poisson(edges_data[\"total_sample_size\"]) # add poisson noise injection \n",
+ "\n",
+ "# # create confidence interval column by concatenating 'lower_confidence_bound'and 'upper_confidence_bound', then dropping those columns\n",
+ "# edges_data['log_odds_ratio_95_confidence_interval'] = edges_data.apply(lambda row: [row['lower_confidence_bound'], row['upper_confidence_bound']], axis=1)\n",
+ "# edges_data = edges_data.drop(['lower_confidence_bound', 'upper_confidence_bound'], axis=1)\n",
+ " \n",
+ " # ----- RE-CONSTRUCT KG FROM NODES AND EDGES FILES ------ #\n",
+ " # merge the subject names, categories and ids from the nodes csv/table to the edges table\n",
+ " kg = pd.merge(edges_data, nodes_data[['id', 'name', 'category']], left_on='subject', right_on = 'id', how=\"inner\")\n",
+ " kg.rename(columns = {'category_x':'predicate_category',\n",
+ " 'category_y': 'subject_category',\n",
+ " 'id': 'subject_id',\n",
+ " 'name': 'subject_name'}, inplace = True)\n",
+ " # merge the object names, categories and ids from the nodes csv/table to the edges table\n",
+ " kg = pd.merge(kg, nodes_data[['id', 'name', 'category']], left_on='object', right_on = 'id', how=\"inner\")\n",
+ " kg.rename(columns = {'id':'object_id',\n",
+ " 'category': 'object_category',\n",
+ " 'name': 'object_name'}, inplace = True)\n",
+ " # ----- ------------------------------------------ ------ #\n",
+ " \n",
+ " # ensure there are no duplicates\n",
+ " kg = kg.drop_duplicates(['subject', 'object', 'auc_roc', 'p_value', 'feature_coefficient'], keep='first')\n",
+ " \n",
+ " # some of the subjects/objects contain the string literal \"NONE\" (specific culprit is COVID Negative or something) Should look into this in future \n",
+ " kg = kg[~kg[\"subject\"].str.contains(\"NONE\")==True] # subject and object are all CURIEs, not names\n",
+ " kg = kg[~kg[\"object\"].str.contains(\"NONE\")==True]\n",
+ " kg = kg[~kg[\"subject\"].str.contains(\"none\")==True]\n",
+ " kg = kg[~kg[\"object\"].str.contains(\"none\")==True]\n",
+ " kg = kg[~kg[\"subject\"].str.contains(\"None\")==True]\n",
+ " kg = kg[~kg[\"object\"].str.contains(\"None\")==True]\n",
+ " \n",
+ " id_list = [] # use this to check if your document IDs are unique. Collect them and see if they're all unique\n",
+ " \n",
+ " # iterate through each row in KG to yield json formatted triple\n",
+ " for index, row in kg[:40].iterrows(): # comment for testing \n",
+ " id_dict = {} # this is the outer dict that holds inner dicts: subject_dict, association_dict, object_dict, and source_dict\n",
+ " subject_dict = {} # inner dict\n",
+ " association_dict = {} # inner dict\n",
+ " object_dict = {} # inner dict\n",
+ " source_dict = {} # inner dict (provides provenance as per TRAPI 1.4 standards)\n",
+ "\n",
+ " # id generated by concatenating the following: subject_id CURIE, object_id CURIE, AUROC (decimal point kept), p-value (decimal point removed), feature coefficient (decimal point removed), and total sample size\n",
+ " doc_id = \"{}_{}_{}_{}_{}_{}\".format(row[\"subject\"],\n",
+ " row[\"object\"],\n",
+ " str(row['auc_roc']),\n",
+ " str(row['p_value']).replace('.', ''),\n",
+ " str(row['feature_coefficient']).replace('.', ''),\n",
+ " str(row[\"total_sample_size\"]))\n",
+ "\n",
+ " id_list.append(doc_id)\n",
+ " id_dict[\"_id\"] = doc_id\n",
+ " subject_dict[\"{}\".format(row[\"subject\"].split(':')[0])] = \"{}\".format(row[\"subject\"].split(':')[1]) # create the subject dict from the rows of the df \n",
+ " subject_dict[\"id\"] = row[\"subject\"]\n",
+ " subject_dict[\"name\"] = row[\"subject_name\"]\n",
+ " subject_dict[\"type\"] = row[\"subject_category\"]\n",
+ "\n",
+ " association_dict[\"predicate\"] = \"{}\".format(row[\"predicate\"].split(':')[1]) # create the association dict from the rows of the df. Edge attributes need extra work. The predicate is separated out into qualified predicate by X-BTE annotation, so we don't have to worry about qualifiers here\n",
+ " association_dict[\"edge_attributes\"] = []\n",
+ "\n",
+ " source_dict[\"edge_sources\"] = []\n",
+ "\n",
+ " association_dict[\"edge_attributes\"].append(\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:has_supporting_study_result\",\n",
+ " \"value\":\"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:update_date\",\n",
+ " \"value\":row[\"provided_date\"]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": row[\"p_value\"],\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": row[\"auc_roc\"],\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": row['feature_coefficient'],\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ "# {\n",
+ "# \"attribute_type_id\": \"biolink:log_odds_ratio_95_confidence_interval\",\n",
+ "# \"value\": row['log_odds_ratio_95_confidence_interval'],\n",
+ "# \"description\": \"log_odds_ratio_95_confidence_interval\"\n",
+ "# },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:total_sample_size\",\n",
+ " \"value\": row[\"total_sample_size\"],\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ " )\n",
+ " association_dict[\"edge_attributes\"].append(\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:primary_knowledge_source\",\n",
+ " \"value\":\"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " }\n",
+ " )\n",
+ " association_dict[\"edge_attributes\"].append(\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:supporting_data_source\",\n",
+ " \"value\":\"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " object_dict[\"{}\".format(row[\"object\"].split(':')[0])] = \"{}\".format(row[\"object\"].split(':')[1]) # create the object dict from the rows of the df \n",
+ " object_dict[\"id\"] = row[\"object\"]\n",
+ " object_dict[\"name\"] = row[\"object_name\"]\n",
+ " object_dict[\"type\"] = row[\"object_category\"]\n",
+ "\n",
+ " source_dict[\"edge_sources\"].append(\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " source_dict[\"edge_sources\"].append(\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " id_dict[\"subject\"] = subject_dict # put the subject, association, object, and source dicts into the outer dict called id_dict\n",
+ " id_dict[\"association\"] = association_dict\n",
+ " id_dict[\"object\"] = object_dict\n",
+ " id_dict[\"source\"] = source_dict\n",
+ " \n",
+ " # flag any row where a relevant value (subject name, subject id/CURIE, subject category, p-value, etc.) is None or one of the strings NONE/None/none/NA: the assertion below fails and its message is printed instead of the row's JSON\n",
+ " try:\n",
+ " assert not {x for x in {row[\"total_sample_size\"],\n",
+ " row[\"subject\"],\n",
+ " row[\"subject_name\"],\n",
+ " row[\"subject_category\"],\n",
+ " row[\"object\"],\n",
+ " row[\"object_name\"],\n",
+ " row[\"object_category\"],\n",
+ " row[\"p_value\"],\n",
+ " row[\"auc_roc\"],\n",
+ " row['feature_coefficient']} if x in {None,\n",
+ " \"NONE\",\n",
+ " \"None\",\n",
+ " \"none\",\n",
+ " \"NA\"}}, \"Error: All values including subject and object IDs, categories, names, p-value, AUC-ROC, and feature coefficient must be non-null and not contain string literal None or NONE\"\n",
+ " print(json.dumps(id_dict, indent=2)) # uncomment for testing\n",
+ "# print(index) # uncomment for testing\n",
+ "# yield id_dict # comment for testing\n",
+ " except AssertionError as msg:\n",
+ " print(msg)\n",
+ "\n",
+ " if len(id_list) != len(set(id_list)):\n",
+ " print(\"You do not have unique document IDs for each edge in your KG. Either you have duplicate rows/edges, or you simply didn't make a unique identifer (Document ID) for each one.\\n\\n\\n\")\n",
+ " else:\n",
+ " print(\"Document IDs appear to be unique\\n\\n\\n\")\n",
+ " \n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f3a074a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import json\n",
+ "import sys, os\n",
+ "import numpy as np\n",
+ "\n",
+ "def parse_ehr_risk(data_folder):\n",
+ "\n",
+ " edges_filename = \"ehr_risk_edges_data_2022_06_01.csv\"\n",
+ " nodes_filename = \"ehr_risk_nodes_data_2022_06_01.csv\"\n",
+ "\n",
+ " nodes_filepath = os.path.join(data_folder, nodes_filename)\n",
+ " edges_filepath = os.path.join(data_folder, edges_filename)\n",
+ " nodes_data = pd.read_csv(nodes_filepath, sep = ',')\n",
+ " edges_data = pd.read_csv(edges_filepath, sep = ',')\n",
+ " \n",
+ " # the nodes file has duplicate ids; fix in enclave in future\n",
+ " nodes_data = nodes_data.drop_duplicates(subset='id', keep=\"first\")\n",
+ " \n",
+ " # biolink category biolink:ChemicalSubstance has been deprecated. Use biolink:ChemicalEntity instead\n",
+ " nodes_data[\"category\"].mask(nodes_data[\"category\"] == \"biolink:ChemicalSubstance\", \"biolink:ChemicalEntity\" , inplace=True )\n",
+ "\n",
+ " # we originally provided the # of patients with condition --> log + patient count, and # of patients without condition --> log - patient count\n",
+ " # get the approximate total number of patients in the study and call it \"total_sample_size\"\n",
+ " edges_data[\"num_patients_with_condition\"] = 10**(edges_data['log_positive_patient_count']) # convert log pos patient count to an actual # \n",
+ " edges_data[\"num_patients_without_condition\"] = 10**(edges_data['log_negative_patient_count']) # convert log neg patient count to an actual #\n",
+ " edges_data = edges_data.drop(['log_positive_patient_count', 'log_negative_patient_count'], axis=1)\n",
+ " edges_data[\"total_sample_size\"] = edges_data[\"num_patients_with_condition\"] + edges_data[\"num_patients_without_condition\"]\n",
+ " edges_data = edges_data.drop(['num_patients_with_condition', 'num_patients_without_condition'], axis=1)\n",
+ " edges_data[\"total_sample_size\"] = np.random.poisson(edges_data[\"total_sample_size\"]) # add poisson noise injection \n",
+ "\n",
+ "# # create confidence interval column by concatenating 'lower_confidence_bound'and 'upper_confidence_bound', then dropping those columns\n",
+ "# edges_data['log_odds_ratio_95_confidence_interval'] = edges_data.apply(lambda row: [row['lower_confidence_bound'], row['upper_confidence_bound']], axis=1)\n",
+ "# edges_data = edges_data.drop(['lower_confidence_bound', 'upper_confidence_bound'], axis=1)\n",
+ " \n",
+ " # ----- RE-CONSTRUCT KG FROM NODES AND EDGES FILES ------ #\n",
+ " # merge the subject names, categories and ids from the nodes csv/table to the edges table\n",
+ " kg = pd.merge(edges_data, nodes_data[['id', 'name', 'category']], left_on='subject', right_on = 'id', how=\"inner\")\n",
+ " kg.rename(columns = {'category_x':'predicate_category',\n",
+ " 'category_y': 'subject_category',\n",
+ " 'id': 'subject_id',\n",
+ " 'name': 'subject_name'}, inplace = True)\n",
+ " # merge the object names, categories and ids from the nodes csv/table to the edges table\n",
+ " kg = pd.merge(kg, nodes_data[['id', 'name', 'category']], left_on='object', right_on = 'id', how=\"inner\")\n",
+ " kg.rename(columns = {'id':'object_id',\n",
+ " 'category': 'object_category',\n",
+ " 'name': 'object_name'}, inplace = True)\n",
+ " # ----- ------------------------------------------ ------ #\n",
+ " \n",
+ " # ensure there are no duplicates\n",
+ " kg = kg.drop_duplicates(['subject', 'object', 'auc_roc', 'p_value', 'feature_coefficient'], keep='first')\n",
+ " \n",
+ " # some of the subjects/objects contain the string literal \"NONE\" (specific culprit is COVID Negative or something) Should look into this in future \n",
+ " kg = kg[~kg[\"subject\"].str.contains(\"NONE\")==True] # subject and object are all CURIEs, not names\n",
+ " kg = kg[~kg[\"object\"].str.contains(\"NONE\")==True]\n",
+ " kg = kg[~kg[\"subject\"].str.contains(\"none\")==True]\n",
+ " kg = kg[~kg[\"object\"].str.contains(\"none\")==True]\n",
+ " kg = kg[~kg[\"subject\"].str.contains(\"None\")==True]\n",
+ " kg = kg[~kg[\"object\"].str.contains(\"None\")==True]\n",
+ " \n",
+ " id_list = [] # use this to check if your document IDs are unique. Collect them and see if they're all unique\n",
+ " \n",
+ " # iterate through each row in KG to yield json formatted triple\n",
+ " for index, row in kg[:40].iterrows(): # comment for testing \n",
+ " id_dict = {} # this is the outer dict that holds inner dicts: subject_dict, association_dict, object_dict, and source_dict\n",
+ " subject_dict = {} # inner dict\n",
+ " association_dict = {} # inner dict\n",
+ " object_dict = {} # inner dict\n",
+ " source_dict = {} # inner dict (provides provenance as per TRAPI 1.4 standards)\n",
+ "\n",
+ " # id generated by concatenating the following: subject_id CURIE, object_id CURIE, AUROC (decimal point kept), p-value (decimal point removed), feature coefficient (decimal point removed), and total sample size\n",
+ " doc_id = \"{}_{}_{}_{}_{}_{}\".format(row[\"subject\"],\n",
+ " row[\"object\"],\n",
+ " str(row['auc_roc']),\n",
+ " str(row['p_value']).replace('.', ''),\n",
+ " str(row['feature_coefficient']).replace('.', ''),\n",
+ " str(row[\"total_sample_size\"]))\n",
+ "\n",
+ " id_list.append(doc_id)\n",
+ " id_dict[\"_id\"] = doc_id\n",
+ " subject_dict[\"{}\".format(row[\"subject\"].split(':')[0])] = \"{}\".format(row[\"subject\"].split(':')[1]) # create the subject dict from the rows of the df \n",
+ " subject_dict[\"id\"] = row[\"subject\"]\n",
+ " subject_dict[\"name\"] = row[\"subject_name\"]\n",
+ " subject_dict[\"type\"] = row[\"subject_category\"]\n",
+ "\n",
+ " association_dict[\"predicate\"] = \"{}\".format(row[\"predicate\"].split(':')[1]) # create the association dict from the rows of the df. Edge attributes need extra work. The predicate is separated out into qualified predicate by X-BTE annotation, so we don't have to worry about qualifiers here\n",
+ " association_dict[\"edge_attributes\"] = []\n",
+ "\n",
+ " source_dict[\"edge_sources\"] = []\n",
+ "\n",
+ " association_dict[\"edge_attributes\"].append(\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:has_supporting_study_result\",\n",
+ " \"value\":\"We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " \"attributes\": [\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_method_type\",\n",
+ " \"value\": \"STATO:0000149\",\n",
+ " \"description\": \"Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:update_date\",\n",
+ " \"value\":row[\"provided_date\"]\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:p_value\",\n",
+ " \"value\": row[\"p_value\"],\n",
+ " \"description\": \"The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"STATO:0000209\",\n",
+ " \"value\": row[\"auc_roc\"],\n",
+ " \"description\": \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:log_odds_ratio\",\n",
+ " \"value\": row['feature_coefficient'],\n",
+ " \"description\": \"The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.\"\n",
+ " },\n",
+ "# {\n",
+ "# \"attribute_type_id\": \"biolink:log_odds_ratio_95_confidence_interval\",\n",
+ "# \"value\": row['log_odds_ratio_95_confidence_interval'],\n",
+ "# \"description\": \"log_odds_ratio_95_confidence_interval\"\n",
+ "# },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_cohort\",\n",
+ " \"value\": \"age < 18 excluded\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:supporting_study_date_range\",\n",
+ " \"value\": \"2020-2022 (prediction)\"\n",
+ " },\n",
+ " {\n",
+ " \"attribute_type_id\": \"biolink:total_sample_size\",\n",
+ " \"value\": row[\"total_sample_size\"],\n",
+ " \"description\": \"The total number of patients or participants within a sample population.\"\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ " )\n",
+ " association_dict[\"edge_attributes\"].append(\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:primary_knowledge_source\",\n",
+ " \"value\":\"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a\",\n",
+ " \"description\": \"The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).\",\n",
+ " }\n",
+ " )\n",
+ " association_dict[\"edge_attributes\"].append(\n",
+ " {\n",
+ " \"attribute_type_id\":\"biolink:supporting_data_source\",\n",
+ " \"value\":\"infores:providence-st-joseph-ehr\",\n",
+ " \"value_type_id\": \"biolink:InformationResource\",\n",
+ " \"value_url\": \"https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP\",\n",
+ " \"description\": \"A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " object_dict[\"{}\".format(row[\"object\"].split(':')[0])] = \"{}\".format(row[\"object\"].split(':')[1]) # create the object dict from the rows of the df \n",
+ " object_dict[\"id\"] = row[\"object\"]\n",
+ " object_dict[\"name\"] = row[\"object_name\"]\n",
+ " object_dict[\"type\"] = row[\"object_category\"]\n",
+ "\n",
+ " source_dict[\"edge_sources\"].append(\n",
+ " {\n",
+ " \"resource_id\": \"infores:biothings-multiomics-ehr-risk\",\n",
+ " \"resource_role\": \"primary_knowledge_source\",\n",
+ " \"upstream_resource_ids\": \"infores:providence-st-joseph-ehr\"\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " source_dict[\"edge_sources\"].append(\n",
+ " {\n",
+ " \"resource_id\": \"infores:providence-st-joseph-ehr\",\n",
+ " \"resource_role\": \"supporting_data_source\"\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " id_dict[\"subject\"] = subject_dict # put the subject, association, object, and source dicts into the outer dict called id_dict\n",
+ " id_dict[\"association\"] = association_dict\n",
+ " id_dict[\"object\"] = object_dict\n",
+ " id_dict[\"source\"] = source_dict\n",
+ " \n",
+ " # throw error for any rows that are missing any relevant values, such as subject name, subject id/CURIE, subject category, p-value, etc...\n",
+ "\n",
+ " try:\n",
+ " assert not {x for x in {row[\"total_sample_size\"],\n",
+ " row[\"subject\"],\n",
+ " row[\"subject_name\"],\n",
+ " row[\"subject_category\"],\n",
+ " row[\"object\"],\n",
+ " row[\"object_name\"],\n",
+ " row[\"object_category\"],\n",
+ " row[\"p_value\"],\n",
+ " row[\"auc_roc\"],\n",
+ " row['feature_coefficient']} if x in {None,\n",
+ " \"NONE\",\n",
+ " \"None\",\n",
+ " \"none\",\n",
+ " \"NA\"}}, \"Error: All values including subject and object IDs, categories, names, p-value, AUC-ROC, and feature coefficient must be non-null and not contain string literal None or NONE\"\n",
+ " print(json.dumps(id_dict, indent=2)) # uncomment for testing\n",
+ "# print(index) # uncomment for testing\n",
+ "# yield id_dict # comment for testing\n",
+ " except AssertionError as msg:\n",
+ " print(msg)\n",
+ "\n",
+ " if len(id_list) != len(set(id_list)):\n",
+ " print(\"You do not have unique document IDs for each edge in your KG. Either you have duplicate rows/edges, or you simply didn't make a unique identifer (Document ID) for each one.\\n\\n\\n\")\n",
+ " else:\n",
+ " print(\"Document IDs appear to be unique\\n\\n\\n\")\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 151,
+ "id": "03180ac7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document IDs appear to be unique\n",
+ "\n",
+ "[{'_id': 'HP:0008629_HP:0000360_0.8401321539277617_00_8796399245685702_10096539', 'subject': {'HP': '0008629', 'id': 'HP:0008629', 'name': 'Pulsatile tinnitus', 'type': 'biolink:PhenotypicFeature'}, 'association': {'predicate': 'associated_with_increased_likelihood_of', 'edge_attributes': [{'attribute_type_id': 'biolink:has_supporting_study_result', 'value': 'We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).', 'attributes': [{'attribute_type_id': 'biolink:supporting_study_method_type', 'value': 'STATO:0000149', 'description': 'Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)'}, {'attribute_type_id': 'biolink:update_date', 'value': '2022-05-18'}, {'attribute_type_id': 'biolink:p_value', 'value': 0.0, 'description': 'The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.'}, {'attribute_type_id': 'STATO:0000209', 'value': 0.8401321539277617, 'description': \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). 
Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"}, {'attribute_type_id': 'biolink:log_odds_ratio', 'value': 8.796399245685702, 'description': 'The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.'}, {'attribute_type_id': 'biolink:supporting_study_cohort', 'value': 'age < 18 excluded'}, {'attribute_type_id': 'biolink:supporting_study_date_range', 'value': '2020-2022 (prediction)'}, {'attribute_type_id': 'biolink:supporting_study_size', 'value': 10096539, 'description': 'The total number of patients or participants within a sample population.'}]}, {'attribute_type_id': 'biolink:primary_knowledge_source', 'value': 'infores:biothings-multiomics-ehr-risk', 'value_type_id': 'biolink:InformationResource', 'value_url': 'http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a', 'description': 'The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. 
We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).'}, {'attribute_type_id': 'biolink:supporting_data_source', 'value': 'infores:providence-st-joseph-ehr', 'value_type_id': 'biolink:InformationResource', 'value_url': 'https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP', 'description': 'A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.'}]}, 'object': {'HP': '0000360', 'id': 'HP:0000360', 'name': 'Tinnitus', 'type': 'biolink:PhenotypicFeature'}, 'source': {'edge_sources': [{'resource_id': 'infores:biothings-multiomics-ehr-risk', 'resource_role': 'primary_knowledge_source', 'upstream_resource_ids': 'infores:providence-st-joseph-ehr'}, {'resource_id': 'infores:providence-st-joseph-ehr', 'resource_role': 'supporting_data_source'}]}}, {'_id': 'MONDO:0010643_HP:0000360_0.8401321539277617_09998721067797812_8585212287149526_10100701', 'subject': {'MONDO': '0010643', 'id': 'MONDO:0010643', 'name': 'acute leukemia (disease)', 'type': 'biolink:Disease'}, 'association': {'predicate': 'associated_with_increased_likelihood_of', 'edge_attributes': [{'attribute_type_id': 'biolink:has_supporting_study_result', 'value': 'We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. 
Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).', 'attributes': [{'attribute_type_id': 'biolink:supporting_study_method_type', 'value': 'STATO:0000149', 'description': 'Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)'}, {'attribute_type_id': 'biolink:update_date', 'value': '2022-05-18'}, {'attribute_type_id': 'biolink:p_value', 'value': 0.9998721067797812, 'description': 'The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.'}, {'attribute_type_id': 'STATO:0000209', 'value': 0.8401321539277617, 'description': \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). 
Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"}, {'attribute_type_id': 'biolink:log_odds_ratio', 'value': 8.585212287149526, 'description': 'The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.'}, {'attribute_type_id': 'biolink:supporting_study_cohort', 'value': 'age < 18 excluded'}, {'attribute_type_id': 'biolink:supporting_study_date_range', 'value': '2020-2022 (prediction)'}, {'attribute_type_id': 'biolink:supporting_study_size', 'value': 10100701, 'description': 'The total number of patients or participants within a sample population.'}]}, {'attribute_type_id': 'biolink:primary_knowledge_source', 'value': 'infores:biothings-multiomics-ehr-risk', 'value_type_id': 'biolink:InformationResource', 'value_url': 'http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a', 'description': 'The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. 
We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).'}, {'attribute_type_id': 'biolink:supporting_data_source', 'value': 'infores:providence-st-joseph-ehr', 'value_type_id': 'biolink:InformationResource', 'value_url': 'https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP', 'description': 'A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.'}]}, 'object': {'HP': '0000360', 'id': 'HP:0000360', 'name': 'Tinnitus', 'type': 'biolink:PhenotypicFeature'}, 'source': {'edge_sources': [{'resource_id': 'infores:biothings-multiomics-ehr-risk', 'resource_role': 'primary_knowledge_source', 'upstream_resource_ids': 'infores:providence-st-joseph-ehr'}, {'resource_id': 'infores:providence-st-joseph-ehr', 'resource_role': 'supporting_data_source'}]}}, {'_id': 'UNII:25ADE2236L_HP:0000360_0.8401321539277617_09367666401584368_4558176672832635_10098743', 'subject': {'UNII': '25ADE2236L', 'id': 'UNII:25ADE2236L', 'name': 'thrombin', 'type': 'biolink:ChemicalSubstance'}, 'association': {'predicate': 'associated_with_increased_likelihood_of', 'edge_attributes': [{'attribute_type_id': 'biolink:has_supporting_study_result', 'value': 'We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. 
Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).', 'attributes': [{'attribute_type_id': 'biolink:supporting_study_method_type', 'value': 'STATO:0000149', 'description': 'Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)'}, {'attribute_type_id': 'biolink:update_date', 'value': '2022-05-18'}, {'attribute_type_id': 'biolink:p_value', 'value': 0.9367666401584368, 'description': 'The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.'}, {'attribute_type_id': 'STATO:0000209', 'value': 0.8401321539277617, 'description': \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). 
Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"}, {'attribute_type_id': 'biolink:log_odds_ratio', 'value': 4.558176672832635, 'description': 'The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.'}, {'attribute_type_id': 'biolink:supporting_study_cohort', 'value': 'age < 18 excluded'}, {'attribute_type_id': 'biolink:supporting_study_date_range', 'value': '2020-2022 (prediction)'}, {'attribute_type_id': 'biolink:supporting_study_size', 'value': 10098743, 'description': 'The total number of patients or participants within a sample population.'}]}, {'attribute_type_id': 'biolink:primary_knowledge_source', 'value': 'infores:biothings-multiomics-ehr-risk', 'value_type_id': 'biolink:InformationResource', 'value_url': 'http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a', 'description': 'The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. 
We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).'}, {'attribute_type_id': 'biolink:supporting_data_source', 'value': 'infores:providence-st-joseph-ehr', 'value_type_id': 'biolink:InformationResource', 'value_url': 'https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP', 'description': 'A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.'}]}, 'object': {'HP': '0000360', 'id': 'HP:0000360', 'name': 'Tinnitus', 'type': 'biolink:PhenotypicFeature'}, 'source': {'edge_sources': [{'resource_id': 'infores:biothings-multiomics-ehr-risk', 'resource_role': 'primary_knowledge_source', 'upstream_resource_ids': 'infores:providence-st-joseph-ehr'}, {'resource_id': 'infores:providence-st-joseph-ehr', 'resource_role': 'supporting_data_source'}]}}, {'_id': 'UNII:K16AIQ8CTM_HP:0000360_0.8401321539277617_09985626800193924_43575215395209606_10097768', 'subject': {'UNII': 'K16AIQ8CTM', 'id': 'UNII:K16AIQ8CTM', 'name': 'pertuzumab', 'type': 'biolink:ChemicalSubstance'}, 'association': {'predicate': 'associated_with_increased_likelihood_of', 'edge_attributes': [{'attribute_type_id': 'biolink:has_supporting_study_result', 'value': 'We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. 
Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).', 'attributes': [{'attribute_type_id': 'biolink:supporting_study_method_type', 'value': 'STATO:0000149', 'description': 'Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)'}, {'attribute_type_id': 'biolink:update_date', 'value': '2022-05-18'}, {'attribute_type_id': 'biolink:p_value', 'value': 0.9985626800193924, 'description': 'The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.'}, {'attribute_type_id': 'STATO:0000209', 'value': 0.8401321539277617, 'description': \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). 
Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"}, {'attribute_type_id': 'biolink:log_odds_ratio', 'value': 4.3575215395209606, 'description': 'The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.'}, {'attribute_type_id': 'biolink:supporting_study_cohort', 'value': 'age < 18 excluded'}, {'attribute_type_id': 'biolink:supporting_study_date_range', 'value': '2020-2022 (prediction)'}, {'attribute_type_id': 'biolink:supporting_study_size', 'value': 10097768, 'description': 'The total number of patients or participants within a sample population.'}]}, {'attribute_type_id': 'biolink:primary_knowledge_source', 'value': 'infores:biothings-multiomics-ehr-risk', 'value_type_id': 'biolink:InformationResource', 'value_url': 'http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a', 'description': 'The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. 
We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).'}, {'attribute_type_id': 'biolink:supporting_data_source', 'value': 'infores:providence-st-joseph-ehr', 'value_type_id': 'biolink:InformationResource', 'value_url': 'https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP', 'description': 'A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.'}]}, 'object': {'HP': '0000360', 'id': 'HP:0000360', 'name': 'Tinnitus', 'type': 'biolink:PhenotypicFeature'}, 'source': {'edge_sources': [{'resource_id': 'infores:biothings-multiomics-ehr-risk', 'resource_role': 'primary_knowledge_source', 'upstream_resource_ids': 'infores:providence-st-joseph-ehr'}, {'resource_id': 'infores:providence-st-joseph-ehr', 'resource_role': 'supporting_data_source'}]}}, {'_id': 'MONDO:0007972_HP:0000360_0.8401321539277617_009395878968875304_392606416950393_10102280', 'subject': {'MONDO': '0007972', 'id': 'MONDO:0007972', 'name': 'Meniere disease', 'type': 'biolink:Disease'}, 'association': {'predicate': 'associated_with_increased_likelihood_of', 'edge_attributes': [{'attribute_type_id': 'biolink:has_supporting_study_result', 'value': 'We train a large collection of multivariable, binary logistic regression models on EHR data for each specific condition/disease/outcome. Features include labs, medications, and phenotypes. 
Directed edges point from risk factors to specific outcomes (diseases, phenotype, or medication exposure).', 'attributes': [{'attribute_type_id': 'biolink:supporting_study_method_type', 'value': 'STATO:0000149', 'description': 'Binomial logistic regression for analysis of dichotomous dependent variable (in this case, for having this particular condition/disease/outcome or not)'}, {'attribute_type_id': 'biolink:update_date', 'value': '2022-05-18'}, {'attribute_type_id': 'biolink:p_value', 'value': 0.09395878968875304, 'description': 'The p-value represents the probability of observing the estimated coefficient (or more extreme value) under the assumption of the null hypothesis (which assumes that there is no relationship between the independent variable and outcome variable). The p-value associated with each coefficient helps determine whether the relationship between the independent variable and the outcome is statistically significant. A low p-value suggests that the observed relationship between the independent variable and the outcome is unlikely to occur by chance alone, providing evidence against the null hypothesis.'}, {'attribute_type_id': 'STATO:0000209', 'value': 0.8401321539277617, 'description': \"The AUROC provides a way to evaluate the model's ability to discriminate between the two classes (the presenece of absence of condition/disease/outcome). 
Values range between 0-1; the higher the AUROC, the better the model's ability to discriminate between clasess.\"}, {'attribute_type_id': 'biolink:log_odds_ratio', 'value': 3.92606416950393, 'description': 'The logarithm of the odds ratio (log odds ratio), or the ratio of the odds of event Y occurring in an exposed group versus the odds of event Y occurring in a non-exposed group.'}, {'attribute_type_id': 'biolink:supporting_study_cohort', 'value': 'age < 18 excluded'}, {'attribute_type_id': 'biolink:supporting_study_date_range', 'value': '2020-2022 (prediction)'}, {'attribute_type_id': 'biolink:supporting_study_size', 'value': 10102280, 'description': 'The total number of patients or participants within a sample population.'}]}, {'attribute_type_id': 'biolink:primary_knowledge_source', 'value': 'infores:biothings-multiomics-ehr-risk', 'value_type_id': 'biolink:InformationResource', 'value_url': 'http://smart-api.info/registry?q=d86a24f6027ffe778f84ba10a7a1861a', 'description': 'The EHR Risk KP is created and maintained by the Multiomics Provider team from the Institute for Systems Biology in Seattle, WA. Through a partnership with Providence/Swedish Health Services and Institute for Systems Biology, we analyze over 26 million EHRs. 
We use these records to train a large collection of interpretable machine learning models which are integrated into a single large Knowledge Graph, with directed edges pointing from risk factors to specific outcomes (diseases, phenotype, or medication exposure).'}, {'attribute_type_id': 'biolink:supporting_data_source', 'value': 'infores:providence-st-joseph-ehr', 'value_type_id': 'biolink:InformationResource', 'value_url': 'https://github.com/NCATSTranslator/Translator-All/wiki/EHR-Risk-KP', 'description': 'A partnership with Providence/Swedish Health Services and Institute for Systems Biology allows analysis of 26 million EHRs from patients in seven states in the US, including Alaska, California, Montana, Oregon, Washington, Texas, and New Mexico. Please email data-access@isbscience.org for more information.'}]}, 'object': {'HP': '0000360', 'id': 'HP:0000360', 'name': 'Tinnitus', 'type': 'biolink:PhenotypicFeature'}, 'source': {'edge_sources': [{'resource_id': 'infores:biothings-multiomics-ehr-risk', 'resource_role': 'primary_knowledge_source', 'upstream_resource_ids': 'infores:providence-st-joseph-ehr'}, {'resource_id': 'infores:providence-st-joseph-ehr', 'resource_role': 'supporting_data_source'}]}}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "def main():\n",
+ " data_folder = \"../../data\" # uncomment for testing\n",
+ " parse_ehr_risk(data_folder) \n",
+ "\n",
+ " \n",
+ "if __name__ == \"__main__\":\n",
+ " main()\n",
+ "\n",
+ "# def f(): return list(parse_ehr_risk(data_folder))\n",
+ "\n",
+ "# print(f())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4f8e6295",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b391313d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a9131832",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d579a7c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3deaf666",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "38af4ce6",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "975b39b8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ee8cb830",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e8da035e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "36dbb201",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ea4aaa3b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1993c430",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5c1a5344",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d2b7d630",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87af529d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d0a883a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "855f14b3",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9dd7dd4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dffef3dd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1f1b174",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14117bf5",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b4f2f35",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f620ac0c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f9f8ddcb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c12970b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6075e928",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b885d179",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ad7fd3a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31661afb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33b5bdf8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f2de47d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc5ba61b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfbc4354",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6ed22495",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0005db9e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04e71223",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "76b3e63f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6399bef2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "508fcd23",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80e6c8da",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81f51f0b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ddcf59bd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d2f7626b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "43f24102",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b054438",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b600cab2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a26f5991",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d19cabbf",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed293615",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fcff8936",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "67cea542",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4683b850",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "53da2865",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### From the UI, we found duplicate predicates in the June 6, 2020 edges files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "658cef8a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "edges_data.loc[(edges_data['subject'] == 'MONDO:0011849') & (edges_data['object'] == 'MONDO:0005083')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61b5337d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "edges_data.loc[(edges_data['subject'] == 'MONDO:0005083') & (edges_data['object'] == 'MONDO:0011849')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c149a94c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://github.com/uhbrar/ReasonerAPI/blob/update_guide/MigrationAndImplementationGuide1-4.md"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed91434c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Variable names to use for the Clinical Data Committee:\n",
+ "# log_odds_ratio, total_sample_size, log_odds_ratio_95_confidence_interval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cecd7f59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# example 1.4 output from molepro\n",
+ "{\n",
+ " \t\"attribute_type_id\": \"biolink:aggregator_knowledge_source\",\t\n",
+ " \t\"value\": \"infores:molepro\", \n",
+ " \t\"value_type_id\": \"biolink:InformationResource\", \n",
+ " \t\"value_url\": \"https://translator.broadinstitute.org/molepro/trapi/v1.0\",\n",
+ " \t\"description\": \"The Molecular Data Provider KP from NCATS Translator\",\n",
+ " \t\"attribute_source\": \"infores:molepro\"\n",
+ " },\n",
+ "\t{\n",
+ " \t\"attribute_type_id\": \"biolink:aggregator_knowledge_source\",\n",
+ " \t\"value\": \"infores:chembl\",\n",
+ " \t\"value_type_id\": \"biolink:InformationResource\", \n",
+ " \t\"value_url\": \"https://www.ebi.ac.uk/chembl\",\n",
+ " \t\"description\": \"ChEMBL is a manually curated database of bioactive molecules...\",\n",
+ " \t\"attribute_source\": \"infores:molepro\"\n",
+ "\t},\n",
+ "\t{\n",
+ " \t\"attribute_type_id\": \"biolink:primary_knowledge_source\",\n",
+ " \t\"value\": \"infores:clinical-trials-gov\", \n",
+ " \t\"value_type_id\": \"biolink:InformationResource\", \n",
+ " \t\"value_url\": \"https://www.clinicaltrials.gov\",\n",
+ " \t\"description\": \"ClinicalTrials.gov is...\",\n",
+ " \t\"attribute_source\": \"infores:chembl\"\n",
+ "\t}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c3a5ca8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"edges\": [\n",
+ " {\n",
+ " \"id\": \"Association002\", \n",
+ " \"category\": \"biolink:FeatureVariableAssociation\",\n",
+ " \"subject\": \"ncit:C29886\", # Airborne Particulate Matter (PM2.5)\n",
+ " \"predicate\": \"biolink:correlates_with\",\n",
+ " \"object\": \"tvfo:xxxxx\", # t.b.d. term for 'ED Visits for Asthma'\n",
+ " \"subject_modifier\": \"biolink:Exposure\",\n",
+ " \"attributes\": [\n",
+ "\t{\n",
+ " \t\"attribute_type_id\": \"biolink:primary_knowledge_source\", \n",
+ " \t\"value\": \"infores:icees-asthma\", \n",
+ " \t\"value_type_id\": \"biolink:InformationResource\",\t \n",
+ " \t\"value_url\": \"https://icees.renci.org:16339\",\n",
+ " \t\"description\": \"The ICEES Provider ...\",\n",
+ " \t\"attribute_source\": \"infores:icees-asthma\"\n",
+ "\t},\n",
+ "\t{\n",
+ " \t\"attribute_type_id\": \"biolink:supporting_data_source\", \n",
+ " \t\"value\": \"infores:us-epa-airborne-pollutant-exposures-data\",\n",
+ " \t\"value_type_id\": \"biolink:InformationResource\",\t \n",
+ " \t\"description\": \"US Environmental Protection Agency Airborne Pollutant Exposure Data\",\n",
+ " \t\"attribute_source\": \"infores:icees-asthma\"\n",
+ "\t},\n",
+ "\t{\n",
+ " \t\"attribute_type_id\": \"biolink:supporting_data_source\", \n",
+ " \t\"value\": \"infores:unc-cdw-health\",\n",
+ " \t\"value_type_id\": \"biolink:InformationResource\",\t \n",
+ " \t\"description\": \"UNC Carolina Data Warehouse for Health Patient EHR Data\",\n",
+ " \t\"attribute_source\": \"infores:icees-asthma\"\n",
+ "\t}\n",
+ " ]\n",
+ " }\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df78b8ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "edges_sig[\"num_patients_with_condition\"] = 10**(edges_sig['log_positive_patient_count'])\n",
+ "edges_sig[\"num_patients_without_condition\"] = 10**(edges_sig['log_negative_patient_count'])\n",
+ "edges_sig"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}