-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path_2_run_completing_sidechain_downstream.py
238 lines (208 loc) · 14.9 KB
/
_2_run_completing_sidechain_downstream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# step2:
# use foldx buildmodel to complete side chain atom
# for downstream pdbs, non-ATOM entries like HETATM have not been removed from each pdb file
# after the side chain completion, we need consider how to correspond a foldx WT protein with AF2 generated MT protein (and removing all non-ATOM entries)
import os
import shutil
import re
import numpy as np
import pandas as pd
def complete_side_chain(root_path, folder_name):
if not os.path.exists('temp'):
os.makedirs('temp')
print('root_path + folder_name:', root_path + folder_name)
# in order to generate effective residue graph for equivariant models, all residues in a protein should have complete backbone atoms (C, CA, N, O)
if os.path.exists(root_path + folder_name):
residues = ['ARG', 'MET', 'VAL', 'ASN', 'PRO', 'THR', 'PHE', 'ASP', 'ILE',
'ALA', 'GLY', 'GLU', 'LEU', 'SER', 'LYS', 'TYR', 'CYS', 'HIS', 'GLN', 'TRP']
res_code = ['R', 'M', 'V', 'N', 'P', 'T', 'F', 'D', 'I',
'A', 'G', 'E', 'L', 'S', 'K', 'Y', 'C', 'H', 'Q', 'W']
res2code = {x: idxx for x, idxx in zip(residues, res_code)}
chains = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
chain_code = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'G', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
chain2code = {x: idxx for x, idxx in zip(chains, chain_code)}
code2chain = {x: idxx for x, idxx in zip(chain_code, chains)}
# read the pdb name set in current folder
pdbs = os.listdir(root_path + folder_name)
print('total pdb wide-type complex number in current folder:', len(pdbs))
# read the original mutation file for completing side chain atoms more precisely, S stands for single site mutation, while M stands for muliple site mutation
if folder_name.strip('./') == 'M1101':
mutation_set = pd.read_csv(root_path + 'M1101.csv', encoding='latin-1')
elif folder_name.strip('./') == 'M1707':
mutation_set = pd.read_csv(root_path + 'M1707.csv')
elif folder_name.strip('./') == 'S641':
mutation_set = pd.read_csv(root_path + 'S641.csv')
elif folder_name.strip('./') == 'S645':
mutation_set = pd.read_csv(root_path + 'S645.csv')
elif folder_name.strip('./') == 'S1131':
mutation_set = pd.read_csv(root_path + 'S1131.csv')
elif folder_name.strip('./') == 'S4169':
mutation_set = pd.read_csv(root_path + 'S4169.csv')
mutation_set_np = np.array(mutation_set)
if not os.path.exists(root_path + folder_name.strip('/') + '_foldx/'):
os.makedirs(root_path + folder_name.strip('/') + '_foldx/')
my_re = re.compile(r'[A-Za-z]', re.S)
# for downstream side chain completion, there are some complexes which are very big, not suitable to be completed repeatedly per mutation case
# in this case, a name can be set, for all mutations based on the same widetype complex, only one completed pdb will be used as the widetype protein for all mutation cases
large_complex_name = '1KBH'
counter1 = 0 # counter1 is used to record which pdb is being processed now in downstream sets (the order is based on corresponding csv files, i.e., mutation_set_np)
for i in range(mutation_set_np.shape[0]):
counter1 += 1
if counter1 % 3000 == 0:
print('counter:', counter1)
current_entry = mutation_set_np[i]
pdb = current_entry[0] + '.pdb'
# for selecting specific pdbs
# a foldx test about incomplete backbones: the incomplete residue will be deleted, and the overall residue serial number will not be re-arranged
# if pdb != '1JRH.pdb' or counter1 != 138:
# continue
print('current pdb to be processed:', pdb, 'counter:', counter1) # updated pdb name
# only the successfully completed protein (through all the procedures in the loop) can be skipped
if os.path.exists('{}/{}'.format(root_path + folder_name.strip('/') + '_foldx', pdb[:-4] + '__' + str(counter1) + '.pdb')):
print('FoldX_{} already exists'.format(pdb))
continue
# before reading current pdb for finding side chain completion positions and calling foldx
# first to check large_complex_name for saving the completion time for over large proteins
if large_complex_name and large_complex_name == current_entry[0]: # current_entry[0] does not include '.pdb'
existing_pdbs = os.listdir('{}/'.format(root_path + folder_name.strip('/') + '_foldx'))
temp_list = sorted([i for i in existing_pdbs if (large_complex_name in i)])
if len(temp_list) != 0: # only in this case, the previously generated pdb will be copied, otherwise still need to call foldx
shutil.copyfile('{}/{}'.format(root_path + folder_name.strip('/') + '_foldx', temp_list[0]),
'{}/{}'.format(root_path + folder_name.strip('/') + '_foldx', pdb[:-4] + '__' + str(counter1) + '.pdb'))
continue
# read current pdb
with open(root_path + folder_name + pdb) as f:
lines = f.readlines()
# a check to make sure all chain names are letters instead of numbers (for being processed by FoldX)
if lines[0][21].isdigit(): # get the chain id of the first line of pdb (should be an 'ATOM' line)
# record the pdb that should be modified the chain names
with open('./data/data_information/{}_chainchange_protein.txt'.format(folder_name.strip('/')), 'a') as f:
f.write(pdb[:-4] + '__' + str(counter1) + '.pdb' + '\n')
print('chain change protein: {}'.format(pdb[:-4] + '__' + str(counter1) + '.pdb'))
# *** when processing the downstream datasets, current plan is just to record such samples, instead of modifying them like the way of completing the pretraining set ***
continue # skip to the next pdb
counter2 = 0
temp_set1 = set()
for line in lines:
# add an extra limitation, the total residue number cannot exceed 3000:
if line[0:4] == 'ATOM' and line[12:16].strip() == 'CA':
counter2 +=1
# add an extra limitation, at least in a pdb file, the backbone atoms including C, CA, N, O should be retained:
if line[0:4] == 'ATOM':
temp_set1.add(line[12:16].strip())
if counter2 >= 3000:
with open('./data/data_information/{}_overlarge_protein.txt'.format(folder_name.strip('/')), 'a') as f:
f.write(pdb[:-4] + '__' + str(counter1) + '.pdb' + '\n')
print('overlarge protein: {}'.format(pdb[:-4] + '__' + str(counter1) + '.pdb'))
continue
if len(temp_set1) < 4: # C, CA, N, O
with open('./data/data_information/{}_overincomplete_protein.txt'.format(folder_name.strip('/')), 'a') as f:
f.write(pdb[:-4] + '__' + str(counter1) + '.pdb' + '\n')
print('overincomplete protein: {}'.format(pdb[:-4] + '__' + str(counter1) + '.pdb'))
continue
# find a proper mutation site for foldx
residue_flag = -1
temp_flag1 = False
temp_set4 = set()
specified_flag = True # for the case of that the mutation information is known (for downstream datasets)
if specified_flag == True:
# M1101, S641, S645, S1131, S4169 mutation information is on 5th column with the format of D:L483T
# M1707 mutation information is on 5th column with the format of DA11A
cont = ''
counter3 = 0
mutations = current_entry[4].split(',') # DA11A,MA14V,HA18Q,RA19H,FA25A,QA29K,EA33R
# foldx cont example: 'FA39L,FB39L;'
for mutation in mutations:
counter3 += 1
if folder_name.strip('./') == 'M1707': # DA11A,MA14V,HA18Q,RA19H,FA25A,QA29K,EA33R
resabbv = mutation[0] # WT
chainid = mutation[1]
res_idx = mutation[2:-1]
mutname = mutation[-1]
else: # D:L483T,D:V486P,D:H487A,D:A488M,D:I491L,D:A492P,D:M496I
mutation = mutation.split(':')
chainid = mutation[0]
resabbv = mutation[-1][0]
res_idx = mutation[-1][1:-1]
mutname = mutation[-1][-1]
if counter3 < len(mutations):
cont += '{}{}{}{},'.format(resabbv, chainid, res_idx, resabbv)
else:
cont += '{}{}{}{};'.format(resabbv, chainid, res_idx, resabbv)
print('cout for mutation:', cont)
# add a checkpoint for further adjusting cout
# cont = 'YH100AY'
with open('individual_list.txt', 'w') as f:
f.write(cont)
else: # for the pretraining set and other sets
for line in lines:
# print(line[0:4], line[22:28].strip(), re.findall(my_re, line[22:28].strip()), re.findall(my_re, '1a'))
# it seems that foldx cannot handle the residue serial number like 1A/1B (based on line[22:28]), thus filter it out
# it seems that foldx cannot handle the mutation on residue that is lack of complete backbone atoms (C/CA/N/O), becuase foldx will remove this residue (Creating a GAP, e.g., 1hxn_1.pdb)
if line[0:4] == 'ATOM':
# res_idx = int(line[22:28].strip()) # residue serial number
res_idx = line[22:28].strip() # need to consider residue serial number like 1A/1B, thus cannot use int transformation here
if residue_flag != res_idx:
# check the logic of foldx for processing incomplete backbone atoms
# if not ('C' in temp_set4 and 'CA' in temp_set4 and 'N' in temp_set4 and 'O' in temp_set4) and counter3 > 0:
# print('residue_flag:', residue_flag, line, pdb)
# exit()
residue_flag = res_idx
temp_set4 = set()
temp_set4.add(line[12:16].strip()) # atom name
# counter3 += 1
else:
temp_set4.add(line[12:16].strip())
if 'C' in temp_set4 and 'CA' in temp_set4 and 'N' in temp_set4 and 'O' in temp_set4: # need to find a residue postion in which backbone atoms are complete
if not len(re.findall(my_re, line[22:28].strip())): # find residue serial number not like 1A/1B
resname = line[17:21].strip() # had been modified, actually residue is three-digit rather than 4- or more digit, like ALA
chainid = line[21] # A
res_idx = line[22:28].strip() # 4 (is a range, info[2:-1])
# need to consider manual-made amino acid type
if resname in res2code.keys():
resabbv = res2code[resname]
else:
# if outlier residue type is found, this protein will not be retained (resabbv as the input of foldx will not be generated)
with open('./data/data_information/{}_aaoutlier_protein.txt'.format(folder_name.strip('/')), 'a') as f:
f.write(pdb + '\n')
print('aaoutlier protein: {}'.format(pdb))
temp_flag1 = True
break
print('resname, chainid, res_idx, resabbv:', resname, chainid, res_idx, resabbv)
break
# for the test example, do not forget to use exit() below to prevent false pdb generation
# resabbv, chainid, res_idx = 'F', 'B', 13
with open('individual_list.txt', 'w') as f:
# cont = '{}{}{}{};'.format(wildname, chainid, resid, wildname)
cont = '{}{}{}{};'.format(resabbv, chainid, res_idx, resabbv)
f.write(cont) # write to the main dict for guiding foldx to complete side chain
# skip current pdb if current pdb is categorized into 'aaoutlier protein'
if temp_flag1 == True:
continue
# call foldx
# pdb is the full pdb file name
comm = 'foldx_4 --command=BuildModel --pdb={} --mutant-file={} --output-dir={} --pdb-dir={}'.format(
pdb, 'individual_list.txt', 'temp', root_path + folder_name) # output the generated files to temp folder
os.system(comm)
print('finishing to run FoldX.')
# exit() # end here for observing the output of foldx
# if code normally runs to here, indicating that at least no explicit error arises during calling foldx (but it is still possible that no valid mutation file is generated)
# pdb[:-4] is the pdb file name removing the '.pdb' suffix
# record pdbs that cannot be processed by foldx due to the reasons beyond all the screening logics above
if not os.path.exists('{}/{}_1.pdb'.format('temp', pdb[:-4])):
with open('./data/data_information/{}_otherinvalid_protein.txt'.format(folder_name.strip('/')), 'a') as f:
f.write(pdb[:-4] + '__' + str(counter1) + '.pdb' + '\n')
print('other invalid protein: {}'.format(pdb[:-4] + '__' + str(counter1) + '.pdb'))
continue
else:
shutil.move('{}/{}_1.pdb'.format('temp', pdb[:-4]), '{}/{}'.format(root_path + folder_name.strip('/') + '_foldx/', pdb[:-4] + '__' + str(counter1) + '.pdb'))
print('current processed file number:', len(set(os.listdir(root_path + folder_name.strip('/') + '_foldx/'))))
shutil.rmtree('temp')
if not os.path.exists('temp'):
os.makedirs('temp')
else:
print('required original pdb files do not exist.')
if __name__ == '__main__':
# for downstream pdbs, non-ATOM entries like HETATM have not been removed from each pdb file
# after the side chain completion, we need consider how to correspond a foldx WT protein with AF2 generated MT protein (and removing all non-ATOM entries)
complete_side_chain('./data/', 'M1101/') # the processed pdbs are output to the specified fold_name with an extra 'foldx' suffix