toolsGraph.py
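"""Assembly-graph tooling: run metaSPAdes, label repeat unitigs by mapping
them back to a reference with nucmer/show-coords, build the unitig graph
with pyfastg/networkx, and extract per-node features for downstream use."""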
import itertools
import multiprocessing
import subprocess
import time
from multiprocessing import Pool

import networkx as nx
import numpy as np
import pandas as pd
import pyfastg
from Bio import SeqIO
def metaSpades_assembly(folder_name, read1, read2, isAssembly, isGT, num_processes):
    """Run metaSPAdes on a read pair and (optionally) map the resulting
    unitigs back to the reference genome with nucmer/show-coords."""
    genome_name = 'Data/' + folder_name + '/ref_genome.fasta'
    spades_out = 'Data/' + folder_name + '/assemblies'
    myUnitig = spades_out + '/before_rr.fasta'
    nucmer_name = spades_out + '/contig_mapping'
    nucmer_repeat = nucmer_name + '.coords'
    k = 55  # k-mer size of the final SPAdes iteration
    if isAssembly:
        spades_cmd = ('spades.py -1 ' + read1 + ' -2 ' + read2 + ' -o ' + spades_out
                      + ' -t ' + str(num_processes) + ' --meta --disable-rr -m 70')
        with open('Data/' + folder_name + '/spades_terminal_file.txt', "w") as outfile:
            subprocess.run(spades_cmd, shell=True, stdout=outfile, stderr=outfile)
    if isGT:
        # Map unitigs to the reference; --maxmatch reports all maximal matches,
        # which is what exposes repeats.
        command_nucmer_1 = ('nucmer --maxmatch --nosimplify -l ' + str(k - 1)
                            + ' -c ' + str(k - 1) + ' --prefix=' + nucmer_name
                            + ' ' + genome_name + ' ' + myUnitig)
        command_nucmer_2 = 'show-coords -r ' + nucmer_name + '.delta > ' + nucmer_repeat
        with open('Data/' + folder_name + '/nucmer_terminal_file1.txt', "w") as outfile:
            subprocess.run(command_nucmer_1, shell=True, stdout=outfile, stderr=outfile)
        with open('Data/' + folder_name + '/nucmer_terminal_file2.txt', "w") as outfile:
            subprocess.run(command_nucmer_2, shell=True, stdout=outfile, stderr=outfile)
    print("Assembly done")
def get_repeats(coords_file_name, myIDY=100, myL=100, myCN=2):
    """Parse a show-coords file and return the ids of unitigs that align to
    the reference at least myCN times with identity >= myIDY and alignment
    length >= myL percent of the unitig length."""
    dfR = pd.read_csv(coords_file_name, skiprows=5, delimiter=' ', header=None, skipinitialspace=True)
    dfR.columns = ["S1", "E1", "r1", "S2", "E2", "r2", "Len1", "Len2", "r3", "IDY", "r4", "TAGS"]
    dfR = dfR.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    # TAGS holds "<reference tag>\t<unitig tag>"; the unitig tag looks like
    # NODE_<id>_length_<len>_cov_<cov>.
    dfR[['TagR', 'TagU']] = dfR['TAGS'].str.split('\t', expand=True)
    dfR[['NODE', 'TagU', 'length', 'LenU', 'coverage', 'coverageU']] = dfR['TagU'].str.split('_', expand=True)
    dfR = dfR.drop(columns=['S1', 'E1', 'r1', 'S2', 'E2', 'r2', 'Len1', 'TAGS', 'r3', 'r4', 'NODE', 'length', 'coverage', 'coverageU'])
    dfR.Len2 = dfR.Len2.astype(int)
    dfR.IDY = dfR.IDY.astype(float)
    dfR.TagU = dfR.TagU.astype(int)
    dfR.LenU = dfR.LenU.astype(int)
    # Alignment length as a percentage of the unitig length.
    dfR['Len2_norm'] = dfR['Len2'] / dfR['LenU'] * 100
    filtered_rows = dfR[dfR['TagU'].duplicated(keep=False) & (dfR['IDY'] >= myIDY) & (dfR['Len2_norm'] >= myL)]
    tagu_counts = filtered_rows.groupby('TagU').size()
    filtered_df = filtered_rows[filtered_rows['TagU'].isin(tagu_counts[tagu_counts >= myCN].index)]
    Repeat_U_0 = []   # intra repeats: multiple hits on a single reference sequence
    Repeat_U_99 = []  # inter repeats: hits on more than one reference sequence
    for tagu, group in filtered_df.groupby('TagU'):
        tagr_values = group['TagR'].unique()
        if len(tagr_values) > 1:
            Repeat_U_99.append(tagu)
        else:
            Repeat_U_0.append(tagu)
    print("Number of Intra Repeat unitigs:", len(Repeat_U_0))
    print("Number of Inter Repeat unitigs:", len(Repeat_U_99))
    return Repeat_U_0, Repeat_U_99
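# Note: find_map_chunk (below) reads the module-level fastg_dict that
# map_sequence_names fills in before creating the worker pool. Sharing the
# dict this way only works with the fork start method (the Linux default);
# under the spawn start method (macOS/Windows default) the workers would see
# an empty global, and the dictionary would have to be passed to each worker
# explicitly instead.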
def find_map_chunk(before_rr_dict):
    """For each before_rr contig in the chunk, find the FASTG node with the
    identical sequence and record both strand names ('<id>+' and '<id>-')."""
    node_id = []
    contig_id = []
    for contig_name, sequence1 in before_rr_dict.items():
        node_name_map = None
        for node_name, sequence2 in fastg_dict.items():
            if sequence2 == sequence1:
                node_name_map = node_name
                break
        if node_name_map is None:
            # No FASTG node with an identical sequence; skip this contig
            # instead of silently reusing the previous iteration's match.
            continue
        # Field 1 of the underscore-split name is the numeric id for both the
        # before_rr (NODE_<id>_...) and FASTG (EDGE_<id>_...) naming schemes.
        contig_id_i = contig_name.split('_')[1]
        node_id_i = node_name_map.split('_')[1]
        node_id.append(node_id_i + '+')
        contig_id.append(int(contig_id_i))
        node_id.append(node_id_i + '-')
        contig_id.append(int(contig_id_i))
    return node_id, contig_id
def map_sequence_names(before_rr, fastg, name, num_processes):
    """Build a mapping from FASTG node names ('<id>+'/'<id>-') to before_rr
    contig ids by exact sequence comparison, parallelised over contigs."""
    global fastg_dict
    before_rr_dict = {}
    fastg_dict = {}
    for record in SeqIO.parse(before_rr, "fasta"):
        before_rr_dict[record.id] = record.seq
    for record in SeqIO.parse(fastg, "fasta"):
        fastg_dict[record.id] = record.seq
    print("Len of before_rr_dict: ", len(before_rr_dict))
    print("Len of fastg_dict: ", len(fastg_dict))
    # Split the before_rr dictionary into num_processes smaller dictionaries.
    print("num_processes: ", num_processes)
    smaller_dicts = [{} for _ in range(num_processes)]
    print("smaller_dicts len: ", len(smaller_dicts))
    k = len(before_rr_dict) // num_processes + 1  # chunk size per worker
    print("k: ", k)
    for i, (key, value) in enumerate(before_rr_dict.items()):
        smaller_dicts[i // k][key] = value
    args_list = smaller_dicts
    print("args_list len: ", len(args_list))
    node_id = []
    contig_id = []
    with multiprocessing.Pool(processes=num_processes) as p:
        results = p.map(find_map_chunk, args_list)
    for node_id_i, contig_id_i in results:
        node_id.extend(node_id_i)
        contig_id.extend(contig_id_i)
    # Dictionary from FASTG node name to contig id, saved for cross-referencing
    # with Bandage.
    mapping_dict = dict(zip(node_id, contig_id))
    mapping_dict_df = pd.DataFrame(list(mapping_dict.items()), columns=['bandage', 'contig'])
    mapping_dict_df.to_csv('Data/' + name + '/Bandage_map.csv', index=False)
    return mapping_dict
def get_hybG(graphContigs, before_rr, name='shakya_1', num_processes=multiprocessing.cpu_count()):
    """Load the FASTG assembly graph, relabel its nodes with before_rr contig
    ids, and save the resulting undirected graph."""
    G_hybrid = pyfastg.parse_fastg(graphContigs)
    mapping_dict = map_sequence_names(before_rr, graphContigs, name, num_processes)
    # Both strands of a unitig map to the same contig id, so relabeling
    # collapses them into a single node.
    G_hybrid = nx.relabel_nodes(G_hybrid, mapping_dict)
    G_hybrid = G_hybrid.to_undirected()
    nx.write_edgelist(G_hybrid, 'Data/' + name + '/adj_matrix.txt', data=False, delimiter=' ')
    node_label_to_index = {label: index for index, label in enumerate(G_hybrid.nodes())}
    df_graph_map = pd.DataFrame(list(node_label_to_index.items()), columns=['contig', 'idx'])
    df_graph_map.to_csv('Data/' + name + '/G_map.csv', index=False)
    print("G_hybrid formed and saved")
    print(G_hybrid)
    return G_hybrid
def get_length_cov(before_rr):
    """Read unitig length and coverage from SPAdes FASTA headers of the form
    >NODE_<id>_length_<len>_cov_<cov>."""
    unitig_lengths = {}
    unitig_coverages = {}
    with open(before_rr) as f:
        for line in f:
            if line.startswith('>NODE'):
                parts = line.strip().split('_')
                unitig_id = int(parts[1])
                length = int(parts[3])
                coverage = float(parts[5])
                unitig_lengths[unitig_id] = length
                unitig_coverages[unitig_id] = coverage
    return unitig_lengths, unitig_coverages
def chunks(l, n):
    """Yield successive chunks of at most `n` nodes from the iterable `l`."""
    l_c = iter(l)
    while True:
        x = tuple(itertools.islice(l_c, n))
        if not x:
            return
        yield x
def betweenness_centrality_parallel(G, processes=None):
    """Parallel betweenness centrality, adapted from the NetworkX example:
    each worker computes betweenness for a subset of source nodes and the
    partial results are summed."""
    with Pool(processes=processes) as p:
        node_divisor = len(p._pool) * 4  # aim for roughly four chunks per worker
        chunk_size = max(1, G.order() // node_divisor)  # guard against a zero chunk size on small graphs
        node_chunks = list(chunks(G.nodes(), chunk_size))
        num_chunks = len(node_chunks)
        bt_sc = p.starmap(
            nx.betweenness_centrality_subset,
            zip(
                [G] * num_chunks,
                node_chunks,
                [list(G)] * num_chunks,
                [True] * num_chunks,
                [None] * num_chunks,
            ),
        )
    # Reduce the partial solutions.
    bt_c = bt_sc[0]
    for bt in bt_sc[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c
def get_nodeFeatures(G_hybrid, before_rr, num_processes=multiprocessing.cpu_count()):
    # Unitig length and coverage:
    start_time = time.time()
    unitig_lengths, unitig_coverages = get_length_cov(before_rr)
    end_time = time.time()
    print("Time taken to calculate length and coverage features: ", end_time - start_time)
    # Simple degree, normalised by the maximum degree:
    start_time = time.time()
    max_degree = max(G_hybrid.degree(n) for n in G_hybrid.nodes)
    print("max degree: ", max_degree)
    degree_dict_simple = {n: G_hybrid.degree(n) / max_degree for n in G_hybrid.nodes}
    print("Simple degree calculated")
    end_time = time.time()
    print("Time taken to calculate simple degree: ", end_time - start_time)
    # Remove self-loops before the centrality computations:
    G_hybrid.remove_edges_from(nx.selfloop_edges(G_hybrid))
    # Betweenness centrality:
    start_time = time.time()
    betweenness_dict = betweenness_centrality_parallel(G_hybrid, num_processes)
    betweenness_dict = {node: betweenness_dict[node] for node in G_hybrid.nodes()}
    print("Betweenness centrality calculated")
    end_time = time.time()
    print("Time taken to calculate betweenness centrality: ", end_time - start_time)
    # K-core number:
    start_time = time.time()
    core_dict = nx.core_number(G_hybrid)
    core_dict = {node: core_dict[node] for node in G_hybrid.nodes()}
    print("K-Core number calculated")
    end_time = time.time()
    print("Time taken to calculate K-Core number: ", end_time - start_time)
    # Clustering coefficient:
    start_time = time.time()
    clustering_coefficient = nx.clustering(G_hybrid, weight=None)
    clustering_dict = {node: clustering_coefficient[node] for node in G_hybrid.nodes()}
    print("Clustering coeff calculated")
    end_time = time.time()
    print("Time taken to calculate clustering coeff: ", end_time - start_time)
    # Skewed links: the fraction of a node's neighbours whose coverage is at
    # most half of the node's own coverage.
    start_time = time.time()
    skewed_edges = {}
    for node in G_hybrid.nodes():
        num_neighs = len(list(G_hybrid.neighbors(node)))
        if num_neighs == 0:
            skewed_edges[node] = 0
        else:
            s_count = 0
            for neigh in G_hybrid.neighbors(node):
                if unitig_coverages[node] >= 2 * unitig_coverages[neigh]:
                    s_count += 1
            skewed_edges[node] = s_count / num_neighs
    print("Skewed links calculated")
    end_time = time.time()
    print("Time taken to calculate skewed links: ", end_time - start_time)
    print("All features calculated")
    # Assemble the per-node feature vector in a fixed order.
    feature_dict_G = {node:
                      [unitig_lengths[node],
                       unitig_coverages[node],
                       degree_dict_simple[node],
                       betweenness_dict[node],
                       core_dict[node],
                       clustering_dict[node],
                       skewed_edges[node]
                       ] for node in G_hybrid.nodes()}
    node_label_to_index = {node: index for index, node in enumerate(G_hybrid.nodes())}
    df_unitig_node = pd.DataFrame(list(node_label_to_index.items()), columns=['contig', 'idx'])
    return feature_dict_G, df_unitig_node
def save_data(feature_dict_G, all_Repeats, df_unitig_node, name='shakya_1'):
    """Save the feature matrix X_G, the binary repeat labels y, and the
    labelled node-to-index map for the graph."""
    X_G = np.array([feature_dict_G[k] for k in feature_dict_G])
    y_binary_G = np.array([1 if k in all_Repeats else 0 for k in feature_dict_G])
    N = X_G.shape[0]
    m = X_G.shape[1]
    # Save X and y for this graph:
    np.savez('Data/' + name + '/X_G.npz', X_G)
    np.savez('Data/' + name + '/y_binary.npz', y_binary_G)
    print("X, y saved")
    print("Number of unitigs on graph: ", N)
    print("Number of features: ", m)
    print("Number of non-repeat unitigs on the graph: ", N - np.sum(y_binary_G))
    # Save the binary labels alongside the node-to-index map:
    df_unitig_node['y_true_B'] = df_unitig_node.apply(lambda row: 1 if row['contig'] in all_Repeats else 0, axis=1)
    df_unitig_node.to_csv('Data/' + name + '/G_node_to_index.csv', index=False)
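# Minimal usage sketch (illustrative, not part of the original module): the
# read file names and the choice of assembly_graph.fastg are assumptions based
# on the paths hard-coded above and on standard SPAdes output names.
if __name__ == "__main__":
    name = 'shakya_1'  # hypothetical dataset folder under Data/
    read1 = 'Data/' + name + '/reads_1.fastq'  # assumed read file names
    read2 = 'Data/' + name + '/reads_2.fastq'
    spades_out = 'Data/' + name + '/assemblies'
    metaSpades_assembly(name, read1, read2, isAssembly=True, isGT=True,
                        num_processes=multiprocessing.cpu_count())
    # Label repeats from the nucmer coords file written by metaSpades_assembly.
    Repeat_U_0, Repeat_U_99 = get_repeats(spades_out + '/contig_mapping.coords')
    all_Repeats = set(Repeat_U_0) | set(Repeat_U_99)
    # Build the graph, extract node features, and save X/y.
    G_hybrid = get_hybG(spades_out + '/assembly_graph.fastg',
                        spades_out + '/before_rr.fasta', name=name)
    feature_dict_G, df_unitig_node = get_nodeFeatures(G_hybrid, spades_out + '/before_rr.fasta')
    save_data(feature_dict_G, all_Repeats, df_unitig_node, name=name)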