-
Notifications
You must be signed in to change notification settings - Fork 3
/
smudge_E.py
101 lines (73 loc) · 3.05 KB
/
smudge_E.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import networkx as nx
import random
import pdb
import numpy as np
from gensim.models import Word2Vec
def read_into_dict(file_):
    """Load a whitespace-delimited file into a dict of token lists.

    Each non-blank line is split on whitespace; the first token becomes the
    key and the remaining tokens (possibly none) become the value list.

    Args:
        file_: Path of the text file to read.

    Returns:
        dict mapping the first token of every line to the list of the
        remaining tokens on that line. When a key occurs on several lines,
        the last occurrence wins.
    """
    some_dict = {}
    with open(file_) as f:
        for line in f:
            items = line.split()
            if not items:
                # Blank / whitespace-only line (e.g. a trailing newline at the
                # end of the file): there is no key to index, so skip it
                # instead of failing on items[0].
                continue
            some_dict[items[0]] = items[1:]
    return some_dict
def _walk_to_annotated_gene(graph, start, genes_phenos, max_stumbles):
    """Random-walk from *start* until a node with phenotypes is reached.

    Returns the first visited node that has a non-empty entry in
    *genes_phenos*, or None when the walk reaches a node without neighbours
    or uses up *max_stumbles* hops without finding one.
    """
    current = start
    for _ in range(max_stumbles):
        if genes_phenos.get(current):
            return current
        # list(): graph.neighbors() may return a one-shot iterator, which is
        # always truthy and cannot be sampled by random.choice.
        successors = list(graph.neighbors(current))
        if not successors:
            # Dead end: nothing further to explore from here.
            return None
        current = random.choice(successors)
    return current if genes_phenos.get(current) else None


def smudge_phenos(graph, num_walk, super_classes, genes_phenos, disease_phenos,
                  max_stumbles=5):
    """Build the walk corpus linking disease and gene nodes to phenotypes.

    Nodes whose identifier starts with ``'OMIM:'`` are treated as diseases and
    nodes whose identifier consists only of digits are treated as genes.

    * For every disease node listed in *disease_phenos*, ``num_walk`` walks of
      the form ``[disease, phenotype, *super_classes[phenotype]]`` are
      emitted, the phenotype being drawn at random from the disease's list.
    * For every gene node with at least one gene neighbour, ``num_walk``
      attempts are made: a random gene neighbour is picked and the graph is
      walked at random (at most *max_stumbles* hops) until a node with
      phenotypes in *genes_phenos* is found. On success the walk
      ``[gene, phenotype, *super_classes[phenotype]]`` is emitted, using a
      random phenotype of the node that was found.

    A walk is skipped when the drawn phenotype has no entry in
    *super_classes*, when no annotated node is reached, or when the
    phenotype list is empty.

    Args:
        graph: Object exposing ``nodes()`` and ``neighbors(node)``; node
            identifiers must be strings.
        num_walk: Number of walks attempted per start node.
        super_classes: dict mapping a phenotype to the list of classes that
            is appended after it in a walk.
        genes_phenos: dict mapping a gene node to its list of phenotypes.
        disease_phenos: dict mapping a disease node to its list of phenotypes.
        max_stumbles: Maximum number of random hops taken while looking for a
            gene that has phenotypes.

    Returns:
        list of walks, each walk being a list of strings.
    """
    nodes = list(graph.nodes())
    genes_nodes = [node for node in nodes if node.isdigit()]
    disease_nodes = [node for node in nodes if node.startswith('OMIM:')]
    print('Number of genes nodes: {}'.format(len(genes_nodes)))
    print('Number of disease nodes: {}'.format(len(disease_nodes)))

    all_walks = []

    # Walk over disease nodes.
    for dis in disease_nodes:
        phenos = disease_phenos.get(dis)
        if not phenos:
            continue
        for _ in range(num_walk):
            pheno = random.choice(phenos)
            if pheno not in super_classes:
                # Same rule as for gene walks: without known super classes
                # the walk is dropped rather than raising KeyError.
                continue
            all_walks.append([dis, pheno] + list(super_classes[pheno]))

    # Walk over PPI (gene) nodes.
    gene_set = set(genes_nodes)
    for node in genes_nodes:
        # The gene neighbourhood does not change between walks, so compute it
        # once per node; sorting keeps the sampling order independent of
        # string-hash randomisation.
        ppi_adj = sorted(gene_set.intersection(graph.neighbors(node)))
        if not ppi_adj:
            continue
        for _ in range(num_walk):
            found = _walk_to_annotated_gene(
                graph, random.choice(ppi_adj), genes_phenos, max_stumbles)
            if found is None:
                continue
            pheno = random.choice(genes_phenos[found])
            if pheno in super_classes:
                all_walks.append([node, pheno] + list(super_classes[pheno]))

    return all_walks
if __name__ == '__main__':
    # Directory that holds the input files; the embeddings are written there
    # as well.
    data = '../../../Documents/smudge_updated_data/'

    #modify input files for human data
    graph = nx.read_edgelist(data+'human_graph.txt', create_using=nx.DiGraph(), data=(('label', str),))
    genes_phenos = read_into_dict(data+'human_genes_mouse_phen.txt')
    disease_phenos = read_into_dict(data+'omim_hpos.txt')
    super_classes = read_into_dict(data+'phenomNet_super_classes.txt')
    print('The number of nodes in graph is: {}'.format(len(graph.nodes())))

    print('Walking PPIs and Phenotypes ...')
    walks = smudge_phenos(graph, 500, super_classes, genes_phenos, disease_phenos)

    print('Training the graph corpus...')
    # NOTE(review): gensim >= 4 renamed `size` to `vector_size` - confirm
    # which gensim version this script is pinned to.
    model = Word2Vec(walks,size=128, window=40, min_count=1, sg =1, workers=24)

    # Newer gensim releases expose the vectors (and save_word2vec_format) on
    # model.wv; fall back to the model itself for releases without `wv`.
    vectors = getattr(model, 'wv', model)
    vectors.save_word2vec_format(data+'smudge_E_Vec_human_embeddings_500.txt')
    # The script ends here without an interactive breakpoint so that it can
    # run to completion unattended.