# hotspots_json_out.py (forked from prcurran/hotspots)
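"""Run Fragment Hotspot Maps calculations on every structure file in a directory.

For each recognised input file (.pdb, .ent, or .cif) the script calculates
hotspot maps with the CCDC hotspots API, scores the protein atoms, and writes
a per-residue CSV plus a FunPDBe-style JSON summary under
<input_dir>/results/<NUM_ROTS>/. Per-file timings are appended to
processed_files_and_timings.csv so already-completed entries are skipped on
re-runs.

Example invocation (assuming a ./structures directory of input files):
    python hotspots_json_out.py structures --retain cavity
"""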
from ccdc.protein import Protein
from ccdc.io import MoleculeWriter
from hotspots.hs_io import HotspotWriter
from hotspots.calculation import Runner
import pandas as pd
import re
import json
from time import perf_counter
import os
from pathlib import Path
import argparse
# import glob
WORKING_DIR = Path(__file__).parent.resolve()
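# Number of probe rotations sampled per calculation (passed to Runner.Settings.nrotations below)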
NUM_ROTS = 3000
output_probe_label = {
'donor_projected' : 'donor',
'donor_ch_projected' : 'donor CH',
'acceptor_projected' : 'acceptor',
'hydrophobe' : 'hydrophobe',
}
def categorise_hotspot_score(score, probe):
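    """Return a qualitative label such as "strong donor" for a raw hotspot score.

    Scores above 17 are "strong", scores from 14 to 17 "moderate", and anything
    lower "weak"; probe names are mapped to friendlier labels where a
    translation exists in output_probe_label.
    """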
if probe in output_probe_label:
probe = output_probe_label[probe]
if score > 17:
return f"strong {probe}"
elif score >= 14:
return f"moderate {probe}"
else:
return f"weak {probe}"
def find_hotspots(file, args, profile=False):
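    """Calculate hotspots for a single structure file and write the results.

    Produces <pdb_id>_hotspots.csv and <pdb_id>_hotspots.json in the output
    directory and returns a dict of wall-clock timings for each stage.
    """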
    # Split on the final "." so filenames containing extra dots still parse
    pdb_id, file_type = file.rsplit(".", 1)
    out_dir = f"{args.input_dir}/results/{NUM_ROTS}/{pdb_id}_{file_type}"
    # makedirs creates the whole results/<NUM_ROTS>/<id> tree in one call
    os.makedirs(out_dir, exist_ok=True)
timings = {"pdb_id": pdb_id,
"file_type": file_type}
t0 = perf_counter()
print(f"Begin processing for {pdb_id}...")
    filepath = os.path.join(args.input_dir, file)
prot = Protein.from_file(filepath)
prot.remove_all_waters()
prot.add_hydrogens()
for c in prot.cofactors:
prot.remove_cofactor(c.identifier)
for l in prot.ligands:
prot.remove_ligand(l.identifier)
# Create lookup dictionaries of chain and residue IDs
chain_id_lookup = {}
residue_id_lookup = {}
for c in prot.chains:
chain_id_lookup[c.identifier] = c.author_identifier
residue_id_lookup[c.identifier] = {}
for r in c.residues:
# residue identifier is returned as "C:RES123" where "C" is chain ID
rid = r.identifier.split(":")[-1]
rid_author = r.author_identifier.split(":")[-1]
residue_id_lookup[c.identifier][rid] = rid_author
t1 = perf_counter()
timings['pre-processing'] = t1-t0
    if profile:
print(f"Pre-processing protein = {t1-t0:.2f} seconds")
save_cavity = "cavity" in args.retain
runner = Runner(pdb_id=pdb_id, output_path=out_dir, save_cavity=save_cavity)
settings = Runner.Settings()
settings.nrotations = NUM_ROTS
    # Only SuperStar jobs are parallelised (one job per processor). By default
    # there are 3 jobs; when calculating charged interactions there are 5.
    print(f"pdb_id = {pdb_id}")
results = runner.from_protein(prot,
nprocesses=1,
charged_probes=False,
buriedness_method='ghecom',
settings=settings)
t2 = perf_counter()
timings['calculation'] = t2-t1
if profile:
print(f"Whole calculation process = {t2-t1:.1f} seconds")
# Creates "results/pdb1/out.zip"
with HotspotWriter(out_dir) as writer:
writer.write(results)
# print(f"Calculations for {file_ID} saved to {out_dir}")
# t4 = perf_counter()
# if profile == True:
# print(f"Save initial results = {t4-t2:.2f} seconds")
p = results.score(results.protein, as_dict=True)
t3 = perf_counter()
timings['scoring'] = t3-t2
if profile:
print(f"Scoring = {t3-t2:.1f} seconds")
    df = (pd.DataFrame.from_dict(p, orient='index',
                                 columns=['type', 'chain_ID', 'residue_ID', 'score'])
          .reset_index()
          .rename(columns={'index': 'atom_ID'})
          .groupby(['chain_ID', 'residue_ID'])[['atom_ID', 'type', 'score']]
          .agg(list)
          )
df.to_csv(f"{out_dir}/{pdb_id}_hotspots.csv")
t4 = perf_counter()
timings['create_dataframe'] = t4-t3
if profile:
print(f"Create dataframe = {t4-t3:.2f} seconds")
final_dict = {
"data_resource": "CSD/Fragment Hotspots",
"resource_version": "CSD v5.45",
"software_version": "Hotspots v1.06",
"resource_entry_url": "https://www.ccdc.cam.ac.uk/open-source-products/fragment-hotspots/",
"release_date": "20/02/2024",
"pdb_id": pdb_id,
"chains": [],
"sites" : [],
"evidence_code_ontology": [
{
"eco_term": "automatically integrated combinatorial computational and experimental evidence used in automatic assertion",
"eco_code": "ECO_0007667"
}
]
}
for chain, single_chain_df in df.groupby(level=0):
single_chain_df = single_chain_df.droplevel(0)
author_chain = chain_id_lookup[chain]
chain_dict = {'chain_label' : author_chain,
'residues' : []
}
for residue in single_chain_df.index:
            if residue not in residue_id_lookup[chain]:
                print(f"WARNING: skipping residue {residue}, reported by hotspots "
                      f"but not found in the lookup table for chain {chain}")
                continue
            author_residue = residue_id_lookup[chain][residue]
            try:
                # Author residue IDs look like "ALA123"; split into name and number
                residue_dict = {'pdb_res_label': re.findall(r"\d+", author_residue)[0],
                                'aa_type': re.findall(r"[A-Z]+", author_residue)[0],
                                'site_data': []
                                }
            except IndexError:
                print(f"Could not parse author residue ID {author_residue!r} for {residue}; skipping")
                continue
            num_sites_in_residue = len(single_chain_df.loc[residue, "atom_ID"])
            try:
                for i in range(num_sites_in_residue):
                    site_id_ref = single_chain_df.loc[residue, "atom_ID"][i]
                    raw_score = round(single_chain_df.loc[residue, "score"][i])
                    raw_score_unit = single_chain_df.loc[residue, "type"][i][0]
                    atom_dict = {'site_id_ref': site_id_ref,
                                 'raw_score': raw_score,
                                 'raw_score_unit': raw_score_unit,
                                 # literal string label, not JSON null
                                 'confidence_classification': 'null'
                                 }
                    score_label = categorise_hotspot_score(raw_score, raw_score_unit)
                    site_dict = {'site_id': site_id_ref,
                                 'label': score_label
                                 }
                    residue_dict['site_data'].append(atom_dict)
                    final_dict['sites'].append(site_dict)
            except Exception:
                # Report the offending residue before re-raising
                print(f"Error found for {residue}")
                raise
chain_dict['residues'].append(residue_dict)
final_dict['chains'].append(chain_dict)
    # Serialise the assembled dict and write it alongside the CSV output
    json_object = json.dumps(final_dict, indent=4)
    with open(f"{out_dir}/{pdb_id}_hotspots.json", "w") as outfile:
        outfile.write(json_object)
print(f'Hotspots for {pdb_id} written to "{out_dir}/{pdb_id}_hotspots.json"')
t5 = perf_counter()
if profile:
print(f"Process JSON - {t5-t4:.2f} seconds")
timings['total'] = t5-t0
return timings
if __name__ == '__main__':
retainable_files = ['cavity',]
parser = argparse.ArgumentParser(description='Run Hotspots on all suitable files in a subdirectory')
parser.add_argument('input_dir',
help='the subdirectory containing the input files',
type=str)
parser.add_argument('-s', '--sort',
action='store_true',
help='sort all files by size and run smallest first')
parser.add_argument('--retain',
choices=retainable_files,
default=[],
nargs='*',
help='retain intermediate files')
args = parser.parse_args()
input_dir = WORKING_DIR / args.input_dir
    # Collect all regular files in the input directory; with --sort, order
    # them by size so the smallest structures are processed first
    list_of_files = [f for f in os.listdir(input_dir)
                     if os.path.isfile(os.path.join(input_dir, f))]
    if args.sort:
        list_of_files.sort(key=lambda f: os.stat(os.path.join(input_dir, f)).st_size)
    # Print each file along with its size in bytes
    for name_of_file in list_of_files:
        path_of_file = os.path.join(input_dir, name_of_file)
        size_of_file = os.stat(path_of_file).st_size
        print(size_of_file, '-->', name_of_file)
try:
ref_df = pd.read_csv("processed_files_and_timings.csv")
print("ref_df loaded")
id_set = set(ref_df['pdb_id'])
except FileNotFoundError:
print("No record of processed files found")
print("Creating a new file instead")
ref_df = pd.DataFrame()
id_set = set()
print(f"{(list_of_files)} to process")
for file in list_of_files:
        if "." not in file:
            print(f"{file} has no file extension; skipping")
            continue
        pdb_id, file_type = file.rsplit(".", 1)
        if file_type not in {'cif', 'ent', 'pdb'}:
            print(f"{file} is not of a recognised file type")
            continue
if pdb_id in id_set:
print(f"{pdb_id} already processed")
continue
        try:
            timings = find_hotspots(file, args, profile=False)
            print(f"\n{file} processed in {timings['total']:.1f} seconds")
            # find_hotspots returns a dict of timings; wrap it in a one-row frame
            new_df = pd.DataFrame([timings])
            # Add the result to the existing record and save the latest version
            ref_df = pd.concat([ref_df, new_df], axis=0)
            print(f"{len(ref_df)} of {len(list_of_files)} files complete!\n")
            ref_df.to_csv("processed_files_and_timings.csv", index=False)
        except Exception as error:
            print(f"An error occurred for {file}:", type(error).__name__, "-", error)