-
Notifications
You must be signed in to change notification settings - Fork 0
/
SP_calculator_DB.py
300 lines (247 loc) · 11.8 KB
/
SP_calculator_DB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
import sys
import os
import re
import zipfile
import time
from time import gmtime, strftime
import logging
import pickle
from Bio.PDB import *
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from numpy import *
def calculate_distance(res1, res2, chain, chain2):
"""
Retrieves a distance value between two residues.
Does the calculation for GLY residues, without presenting CB
"""
res_name1 = chain[res1].get_resname()
res_name2 = chain2[res2].get_resname()
try:
if res_name1 == 'GLY':
# get atom coordinates as vectors
n = chain[res1]['N'].get_vector()
c = chain[res1]['C'].get_vector()
ca = chain[res1]['CA'].get_vector()
# center at origin
n = n - ca
c = c - ca
# find rotation matrix that rotates n -120 degrees along the ca-c vector
rot = rotaxis(-pi * 120.0 / 180.0, c)
# apply rotation to ca-n vector
cb_at_origin = n.left_multiply(rot)
# put on top of ca atom
cb1 = cb_at_origin + ca
if res_name2 == 'GLY':
# get atom coordinates as vectors
n = chain2[res2]['N'].get_vector()
c = chain2[res2]['C'].get_vector()
ca = chain2[res2]['CA'].get_vector()
# center at origin
n = n - ca
c = c - ca
# find rotation matrix that rotates n -120 degrees along the ca-c vector
rot = rotaxis(-pi * 120.0 / 180.0, c)
# apply rotation to ca-n vector
cb_at_origin = n.left_multiply(rot)
# put on top of ca atom
cb2 = cb_at_origin + ca
if res_name1 == 'GLY':
distance = sqrt((cb1[0]-cb2[0])**2 + (cb1[1]-cb2[1])**2 + (cb1[2]-cb2[2])**2)
else:
cb1 = chain[res1]['CB'].get_vector()
distance = sqrt((cb1[0]-cb2[0])**2 + (cb1[1]-cb2[1])**2 + (cb1[2]-cb2[2])**2)
elif res_name1 == 'GLY':
cb2 = chain2[res2]['CB'].get_vector()
distance = sqrt((cb1[0]-cb2[0])**2 + (cb1[1]-cb2[1])**2 + (cb1[2]-cb2[2])**2)
else:
distance = chain[res1]['CB'] - chain2[res2]['CB']
except:
logging.info('Some trouble to access the atoms, located in {} or {}. Most probable explanation: \
The structure presents only CA'.format(res1,res2))
return 0
return distance
def select_and_save_distances(beg_helix1,end_helix1,
beg_helix2,end_helix2,
chain1,chain2, transdom_intra_distances,
transdom_aa_freq, transdom_dist_freq):
# Compares the positions of each chain and within.
# Checks if the residue has name, some of them are empty
if chain1 is chain2:
if beg_helix1 is beg_helix2:
beg_helix2 += 5
for res1 in range(beg_helix1, end_helix1 + 1):
try:
res_name1 = chain1[res1].get_resname()
except KeyError as e:
continue
for res2 in range(beg_helix2, end_helix2 + 1):
if res1 == res2:
continue
try:
res_name2 = chain2[res2].get_resname()
except KeyError as e:
print('This position ( {} ) didnt present name, skiping it.'.format(res2))
continue
distance = calculate_distance(res1, res2, chain1, chain2)
if distance %1 >= 0.5:
distance = round(distance)
else:
distance = distance - (distance % 1)
distance = round(distance) + 0.5
save_results_dict(res_name1, res_name2,
distance, transdom_intra_distances,
transdom_aa_freq,
transdom_dist_freq)
def save_results_dict(res_name1, res_name2, distance, dict_name_dist, dict_name_freq,dict_freq_dist):
"""
Saves the results of the distances and the frequency in the designed dict.
The frequency dict is given by default, not necessary to definite it
"""
# Check if the entry of the 2 aa has been generated reversely!
if (res_name2, res_name1) in dict_name_dist:
dict_name_dist[(res_name2, res_name1)].setdefault(distance, 0)
dict_name_dist[(res_name2, res_name1)][distance] += 1
dict_name_dist[(res_name2, res_name1)][999] += 1
dict_name_freq.setdefault(res_name1, 0)
dict_name_freq[res_name1] += 1
return
# Save results into a dict-list
dict_name_dist.setdefault((res_name1, res_name2), {})
dict_name_dist[(res_name1, res_name2)].setdefault(distance, 0)
dict_name_dist[(res_name1, res_name2)][distance] += 1
dict_name_dist[(res_name1, res_name2)].setdefault(999, 0)
dict_name_dist[(res_name1, res_name2)][999]+= 1
dict_name_freq.setdefault(res_name1, 0)
dict_name_freq[res_name1] += 1
dict_freq_dist.setdefault(distance,0)
dict_freq_dist[distance] += 1
return
def total_division_and_pickling(tuple_dict):
"""Divides each dictionary with the total count and saves the result in a pickle"""
transdom_intra_distances = tuple_dict[0]
transdom_inter_distances = tuple_dict[1]
transdom_aa_freq = tuple_dict[2]
transdom_dist_intra_freq = tuple_dict[3]
transdom_dist_inter_freq = tuple_dict[4]
for dires,value in transdom_inter_distances.items():
for dist,count in value.items():
if dist == 999:
continue
transdom_inter_distances[dires][dist] = transdom_inter_distances[dires][dist] / transdom_inter_distances[dires][999]
for dires,value in transdom_intra_distances.items():
for dist,count in value.items():
if dist == 999:
continue
transdom_intra_distances[dires][dist] = transdom_intra_distances[dires][dist] / transdom_intra_distances[dires][999]
total_aa = sum(values for values in transdom_aa_freq.values())
for key,value in transdom_aa_freq.items():
transdom_aa_freq[key] = transdom_aa_freq[key] / total_aa
total_intra = sum(values for values in transdom_dist_intra_freq.values())
for key,value in transdom_dist_intra_freq.items():
transdom_dist_intra_freq[key] = transdom_dist_intra_freq[key] / total_intra
total_inter = sum(values for values in transdom_dist_inter_freq.values())
for key,value in transdom_dist_inter_freq.items():
transdom_dist_inter_freq[key] = transdom_dist_inter_freq[key] / total_inter
tuple_dict = (transdom_intra_distances,
transdom_inter_distances,transdom_aa_freq,
transdom_dist_intra_freq,transdom_dist_inter_freq)
pickle.dump(tuple_dict, open( "SP_dataset.p", "wb" ) )
logging.info('THE END\n\n')
exit()
def MAIN(directory):
"""
Calculates intra and inter distances of transmembrane domains present in
the protein.
Returns dataset of distances for all the proteins analysed.
"""
transdom_inter_distances = {}
transdom_intra_distances = {}
transdom_aa_freq = {}
transdom_dist_intra_freq = {}
transdom_dist_inter_freq = {}
logging.info('Processing the following dir: {}'.format(directory))
filename_chain_dom_dict = pickle.load( open( "my_phobius_domains.p", "rb" ))
for PDB_id, chains_file in filename_chain_dom_dict.items():
if filename_chain_dom_dict[PDB_id] == None:
continue
PDB_id = PDB_id[:-5] + 'pdb'
file_dir = os.path.join(directory, PDB_id)
filename = file_dir
# Select the file and generate a structure var with all the pdb inside.
#SOmetimes there is no option to generate the structure due to PDB files errors.
file_cif = '/home/adria/UPF/PDB_cif/' + PDB_id[:-3] + 'cif'
mmcif_dict = MMCIF2Dict(file_cif)
try:
entity = PDBParser()
structure = entity.get_structure(mmcif_dict['_entry.id'], filename)
logging.info('Working with {}'.format(mmcif_dict['_entry.id']))
model = structure[0]
except Exception as e:
logging.info('Unable to define structure, {}. '.format(mmcif_dict['_entry.id']))
continue
#CHeck if the protein has helix domains.
chain_ind = 0
typical_chains = ('A','B','C','D','E','F','G','H','I','J','K','L')
for chain, domains in chains_file.items():
logging.info('Chain processed: {}\n Transdom: {}'.format(chain, domains))
non_interesting_case = re.search("[0-9]|[a-z]", chain)
other_non_interesting_case = re.search("[P-Z]", chain)
if non_interesting_case or other_non_interesting_case:
continue
if chain not in model:
if 'A' not in filename_chain_dom_dict.keys():
chain = typical_chains[chain_ind]
chain_ind += 1
chain1 = model[chain]
for val in domains:
beg_helix1, end_helix1 = int(val[0]), int(val[1])
# ################INTRA DISTANCES ####################################
select_and_save_distances(beg_helix1,end_helix1,
beg_helix1,end_helix1,
chain1,chain1, transdom_intra_distances,
transdom_aa_freq,
transdom_dist_intra_freq)
# ################INTER DISTANCES SAME CHAIN ##############################
for val2 in domains:
if val == val2:
continue
beg_helix2, end_helix2 = int(val2[0]), int(val2[1])
select_and_save_distances(beg_helix1,end_helix1,
beg_helix2,end_helix2,
chain1,chain1, transdom_inter_distances,
transdom_aa_freq,
transdom_dist_inter_freq)
# ################INTER DISTANCES BTWN CHAIN ########################
chain_ind2 = 0
for chain2, value in chains_file.items():
non_interesting_case2 = re.search("[0-9]|[a-z]", chain)
other_non_interesting_case2 = re.search("[P-Z]", chain)
if non_interesting_case2 or other_non_interesting_case2:
continue
if chain2 not in model:
if 'A' not in filename_chain_dom_dict.keys():
chain2 = typical_chains[chain_ind]
chain_ind2 += 1
chain2 = model[chain2]
if chain1 == chain2:
continue
for val3 in value:
beg_helix3, end_helix3 = int(val3[0]), int(val3[1])
select_and_save_distances(beg_helix1,end_helix1,
beg_helix3,end_helix3,
chain1,chain2, transdom_inter_distances,
transdom_aa_freq,
transdom_dist_inter_freq)
tuple_dict = transdom_intra_distances,transdom_inter_distances,transdom_aa_freq,transdom_dist_intra_freq,transdom_dist_inter_freq
total_division_and_pickling(tuple_dict)
if __name__ == "__main__":
logging.basicConfig(filename=("log_record_" + sys.argv[0] + ".txt"), level=logging.INFO)
logging.info('\n\nNew running: ' + strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
if len(sys.argv) >= 1:
dir_path = sys.argv[1]
if os.path.isdir(dir_path):
MAIN(dir_path)
else:
raise ValueError("We only work with directory files sir.")
else:
raise ValueError("Please specify input [1]")