-
Notifications
You must be signed in to change notification settings - Fork 0
/
drms.py
487 lines (365 loc) · 23.4 KB
/
drms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#!/usr/bin/env python
#
# drms.py
#
# A suite of functions to confirm/verify mean structure hypothesis using distance RMS metric (different from normal RMSD).
#
# Includes code that:
# - calculate the mean distance matrix (i.e. a distance matrix with each entry already weighted-averaged from individual "trial" dist matrices)
# - calculate the distance RMS given two (sets of) matrices
# - some verification function that makes sure the dist matrix is a valid one
#
#
#
import numpy
import os
from tqdm.auto import tqdm, trange
import pickle
import gc
import argparse
## Variables
variants = ['wsh2045', 'wsh2029','wsh2036','wsh2044','wsh2054'] # list of all the variants to loop over, also used to access those files.
ref_var = [ 'dist_matrix', 'dist_matrix_avg', 'dist_matrix_avg_helix' ] # list of file names for reference structure
rfc_path = "/home/jml230/BdpA Publishable/rfc" # should be pointed to your current directory.
mean_matrix_template = '/home/jml230/BdpA Publishable/rfc/{var}_eng/mean_matrix.npy' # points to the mean_matrix file, assumed saved as a numpy array file.
dist_matrix_template = '/home/jml230/BdpA Publishable/avg_struct/{var}/{ref}.dat' # points to where the reference files, assumed saved as a txt file.
weights_template = '/home/jml230/bdpa_{var}_p2_haMSM/restart0_scikit_nowat/run1/{var}_final_reweighted_weights.dat' # points to where the final weights are, a text file
matrix_template = '/home/jml230/BdpA Publishable/rfc/{var}/dist_matrix_{file_num}.npy' # points to where the dist matrix arrays are. Assume numpy array files.
eligible_states_template = '/home/jml230/BdpA Publishable/new_dist_with_state_def/{var}/eligible_states_eng_all.dat' # points to where the eligible states list are.
single_matrix_template = '/home/jml230/BdpA Publishable/rfc/{var}_eng/dist_matrix_eng_all.npy' # points to where the dist matrix arrays are. Assume numpy array files. Single versions, so files are not per-run.
single_weights_template = '/home/jml230/BdpA Publishable/rfc/{var}_eng/{var}_eligible_states_eng_all.dat' # points to where the final weights are, a text file
### Old/tester variables
#variants = ['wsh2045']
#ref_path = "/home/jml230/BdpA Publishable/new_folded_struct"
#matrix_template = "/home/jml230/BdpA Publishable/rfc/{var}/dist_matrix_{file}.npy"
parser = argparse.ArgumentParser(description='Process size_var')
parser.add_argument('--size', metavar='N', nargs='?', type=int, action='store', const=8, default=8)
parser.add_argument('--subset', action='store_true', help='calculate submatrix of (default) size 8')
parser.add_argument('--sigma', default=6, type=int, help="exponent for the logistic contact")
parser.add_argument('--radius', default=5.0, type=float, help="midpoint distance for the logistic contact")
parser.add_argument('--eligible_states', '-eg', action='store_true', help="use a list of eligible states instead of every single segment.")
parser.add_argument('--single', '-s', action='store_true', help="distance matrices are saved in a single file instead of per-run.")
args = parser.parse_args()
size_var = args.size
subset_var = args.subset
sigma_var = args.sigma
radius_var = args.radius
## Functions
def compare_drms(subset=False, size=size_var):
"""
The catch-all function to calculate drms of a set of distance matrices and then weight-average it.
Final output will be a pickle object that it of the following format (called drms.pickle):
A list of dictionaries, each one corresponding to a reference structure. There will be n-elements in the list, corresponding to the number of frames.
There will also be additional datasets (2x the normal number) if subset is set to True, where those datasets will include the sub-matrices. Those will be in the order of (i,i), (i+1,i+1), ..., (N-1,N-1), (N,N).
Just pickle.load() it after it's done. Will likely be huge.
"""
for variant in tqdm(variants, desc='variant'): # Loop through variants
final_list = []
weights = numpy.loadtxt(weights_template.format(var=variant), skiprows=1)
matrices, num_traj = read_individual_matrices(variant)
for matrix in tqdm(matrices, desc=f'{variant} traj'): # Go through each trajectory
ref_dict = {}
final_list.append(ref_dict)
for ref_type in ref_var:
ref_matrix = numpy.loadtxt(dist_matrix_template.format(var=variant, ref=ref_type))
if subset:
# Grab the submatrices and do checks
submatrices = get_submatrices(matrix, size)
ref_submatrices = get_submatrices(ref_matrix, size)
num_matrices = len(submatrices) # Figure out the number submatrices
assert len(submatrices) == len(ref_submatrices), 'Wasn\'t able to generate the same number of submatrices for the reference and mean matrices.'
# Calculating the submatrices stuff and output to dedicated file
submatrix_list = []
for idx in range(0, num_matrices):
val_drms = drms(submatrices[idx], ref_submatrices[idx])
submatrix_list.append(val_drms)
# Saving the array
ref_dict[f'{ref_type}_submatrices'] = numpy.asarray(submatrix_list)
# Do the normal thing of computing the whole matrix drms
val_drms = drms(matrix, ref_matrix)
ref_dict[ref_type]=val_drms
with open(f'{rfc_path}/{variant}/drms_{size}.pickle', 'wb') as po:
pickle.dump(final_list, po)
def compare_mean_drms(subset=False, size=size_var):
"""
The catch-all function to compare a single mean distance matrix with a set of reference distance matrices.
"""
for variant in variants: # Loop through variants
#TODO current tied to having one mean_matrix and multiple ref_var. Can be generalized to more...
mean_matrix = numpy.load(mean_matrix_template.format(var=variant)) # Load mean matrix
with open(f'{rfc_path}/{variant}/dist_output.dat', 'w') as fo:
fo.write(f"{'ref_type':30} {'dRMS':>18}\n")
for ref_type in ref_var:
ref_matrix = numpy.loadtxt(dist_matrix_template.format(var=variant, ref=ref_type))
if subset:
# Grab the submatrices and do checks
submatrices = get_submatrices(mean_matrix, size)
ref_submatrices = get_submatrices(ref_matrix, size)
num_matrices = len(submatrices) # Figure out the number submatrices
assert len(submatrices) == len(ref_submatrices), 'Wasn\'t able to generate the same number of submatrices for the reference and mean matrices.'
# Calculating the submatrices stuff and output to dedicated file
with open(f'{rfc_path}/{variant}/{ref_type}_output.dat', 'w') as subfo:
subfo.write(f"{'index':>8}\t{'dRMS':>18}\n")
for idx in range(0, num_matrices):
val_drms = drms(submatrices[idx], ref_submatrices[idx])
subfo.write(f'{idx:8}\t{val_drms:>18}\n')
# Do the normal thing of computing the whole matrix drms
val_drms = drms(mean_matrix, ref_matrix)
fo.write(f'{ref_type:30} {val_drms:>18}\n')
def compare_transformed_drms(sigma=sigma_var, radius=radius_var, subset=False, size=size_var):
"""
The catch-all function to calculate a transformed drms of a set of distance matrices and then weight-average it.
Instead of the dRMS, the "distance" is transformed to CC = 1- [1/(1+(r/radius)^sigma)] instead.
This theoretically will limit the extra effects of longer distances distances dominating (who has extra room for fluctuations)
Sigma is defaulted to 6. Larger values is supposed to make the 1->0 transition sharper.
Radius is defaulted to 5 Angstroms. This is the "inflection point" for the sigmoid shape. 5 Angstroms is close enough to the 4.5 Ang we use for contacts I think.
Shamefully stolen from PyLOOS and Dr. Alan Grossfield. This is because I already had distances calculated and his code doesn't take
into account of weights.
Final output will be a pickle object that it of the following format (called logistic_drms.pickle):
A list of dictionaries, each one corresponding to a reference structure. There will be n-elements in the list, corresponding to the number of frames.
There will also be additional datasets (2x the normal number) if subset is set to True, where those datasets will include the sub-matrices. Those will be in the order of (i,i), (i+1,i+1), ..., (N-1,N-1), (N,N).
Just pickle.load() it after it's done. Will likely be huge if doing subsets.
"""
for variant in tqdm(variants, desc='variant'): # Loop through variants
final_list = []
if args.single: # If the trajectories are already saved as 1 .npy
weights = numpy.loadtxt(single_weights_template.format(var=variant), skiprows=1, usecols=3)
matrices, num_traj = read_matrices(variant)
else: # If trajectories are still saved as "per-run"
weights = numpy.loadtxt(weights_template.format(var=variant), skiprows=1)
matrices, num_traj = read_individual_matrices(variant)
for matrix in tqdm(matrices, desc=f'{variant} traj'): # Go through each trajectory
ref_dict = {}
final_list.append(ref_dict)
for ref_type in ref_var:
ref_matrix = numpy.loadtxt(dist_matrix_template.format(var=variant, ref=ref_type))
# Do the logistic transformation
ref_matrix = logistic_transform(ref_matrix, sigma_var, radius_var)
matrix = logistic_transform(matrix, sigma_var, radius_var)
if subset:
# Grab the submatrices and do checks
submatrices = get_submatrices(matrix, size)
ref_submatrices = get_submatrices(ref_matrix, size)
num_matrices = len(submatrices) # Figure out the number submatrices
assert len(submatrices) == len(ref_submatrices), 'Wasn\'t able to generate the same number of submatrices for the reference and mean matrices.'
# Calculating the submatrices stuff and output to dedicated file
submatrix_list = []
for idx in range(0, num_matrices):
val_drms = drms(submatrices[idx], ref_submatrices[idx])
submatrix_list.append(val_drms)
# Saving the array
ref_dict[f'{ref_type}_submatrices'] = numpy.asarray(submatrix_list)
# Do the normal thing of computing the whole matrix drms
val_drms = drms(matrix, ref_matrix)
ref_dict[ref_type]=val_drms
with open(f'{rfc_path}/{variant}/logistic_drms_{radius_var}.pickle', 'wb') as po:
pickle.dump(final_list, po)
def compare_transformed_mean_drms(sigma=sigma_var, radius=radius_var, subset=False, size=size_var):
"""
The catch-all function to compare a single mean distance matrix with a set of reference distance matrices. All matrices are logistically transformed before it is compared.
"""
for variant in variants: # Loop through variants
#TODO current tied to having one mean_matrix and multiple ref_var. Can be generalized to more...
mean_matrix = numpy.load(mean_matrix_template.format(var=variant)) # Load mean matrix
mean_matrix = logistic_transform(mean_matrix, sigma_var, radius_var) # Logistic Transform the mean matrix
with open(f'{rfc_path}/{variant}/dist_transformed_{radius_var}_output.dat', 'w') as fo:
fo.write(f"{'ref_type':30} {'dRMS':>18}\n")
for ref_type in ref_var:
ref_matrix = numpy.loadtxt(dist_matrix_template.format(var=variant, ref=ref_type))
# Do the logistic transformation
ref_matrix = logistic_transform(ref_matrix, sigma_var, radius_var)
if subset:
# Grab the submatrices and do checks
submatrices = get_submatrices(mean_matrix, size)
ref_submatrices = get_submatrices(ref_matrix, size)
num_matrices = len(submatrices) # Figure out the number submatrices
assert len(submatrices) == len(ref_submatrices), 'Wasn\'t able to generate the same number of submatrices for the reference and mean matrices.'
# Calculating the submatrices stuff and output to dedicated file
with open(f'{rfc_path}/{variant}/{ref_type}_transformed_{radius_var}_output.dat', 'w') as subfo:
subfo.write(f"{'index':>8}\t{'dRMS':>18}\n")
for idx in range(0, num_matrices):
val_drms = drms(submatrices[idx], ref_submatrices[idx])
subfo.write(f'{idx:8}\t{val_drms:>18}\n')
# Do the normal thing of computing the whole matrix drms
val_drms = drms(mean_matrix, ref_matrix)
fo.write(f'{ref_type:30} {val_drms:>18}\n')
def calc_avg_length():
"""
The catch-all function to extract average length of a set of n residue from the distance matrices and then weight-average it.
Saves as a numpy array and returns nothing.
"""
for variant in tqdm(variants, desc='variant'): # Loop through variants
weights = numpy.loadtxt(weights_template.format(var=variant), skiprows=1)
matrices, num_traj = read_individual_matrices(variant)
matrix_dim = len(matrices[0])
for size_idx in range(2, matrix_dim + 1): # Loop through the different sizes of submatrices; note: 1-based
num_matrices = matrix_dim - size_idx + 1 # Has to be 1-based
submatrix_list = numpy.zeros(num_matrices) # List of weighted avg distance for each atom pair
for traj_idx, matrix in enumerate(tqdm(matrices, desc=f'{variant} traj')): # Go through each trajectory
# Grab the submatrices and do checks
submatrices = get_submatrices(matrix, size_idx)
assert num_matrices == len(submatrices), f'Wrong indexing; {len(submatrices)} != expected length of {num_matrices}'
# Calculating the submatrices stuff and weight-average them
submatrix_list = numpy.zeros(num_matrices)
index_list = numpy.full(num_matrices, size_idx)
for idx in range(0, num_matrices):
submatrix_list[idx] += weights[traj_idx] * matrix[0,-1]
# Appending to the final numpy array
if not final_list:
final_list = numpy.dstack((index_list, submatrix_list))
else:
final_list = numpy.append(final_list, numpy.dstack((index_list, submatrix_list), axis=1))
with open(f'{rfc_path}/{variant}/avg_dist.npy', 'wb') as no:
numpy.save(no, final_list)
def logistic_transform(a_matrix, sigma=6, radius=5.0):
"""
Code to calculate the logistic transform. Shamefully stolen from PyLOOS and Dr. Grossfield.
Instead of the dRMS, each "distance" is transformed CC = 1- [1/(1+(r/radius)^sigma)] instead. The latter term is referred to C(r)
This theoretically will limit the extra effects of longer distances distances dominating (who has extra room for fluctuations)
Sigma is defaulted to 6. Larger values is supposed to make the 1->0 transition sharper.
Radius is defaulted to 5 Angstroms. This is the "inflection point" for the sigmoid shape. 5 Angstroms is close enough to the 4.5 Ang we use for contacts I think.
returns trans_matrix, which is the transformed matrix.
"""
check_dist_matrix(a_matrix, desc='logistic transform input')
a_shape = len(a_matrix)
trans_matrix = a_matrix # Duplicate...
for row in range(0, a_shape): # Loop through all rows
for column in range(row, a_shape): # Loop through all columns
term = (a_matrix[row, column] / radius )**sigma
if row == column:
trans_matrix[row, column] = 1 + term
else:
trans_matrix[row, column] = 1 + term
trans_matrix[column, row] = 1 + term
# Do the 1 / [term] thing for the C(r) term...
trans_matrix = 1 / trans_matrix
# Do the final 1 - C(r) thing...
trans_matrix = 1 - trans_matrix
return trans_matrix
def drms(a_matrix, b_matrix):
"""
Code to calculate dRMS:
Applies the following equation given distance matrices a_matrix and b_matrix and returns a float:
drms(a_matrix, b_matrix) = sqrt( 1/(n * (n-1)) * sum of i[ sum of j[ (a_matrix(ij) - b_matrix(ij))^2 ] ] )
"""
check_dist_matrix(a_matrix, desc='First input')
check_dist_matrix(b_matrix, desc='Second input')
a_shape = len(a_matrix)
b_shape = len(b_matrix)
assert a_shape == b_shape, "Matrices given are not of the same shape."
sum_term = 0 # Value of the double sum
diff = 0 # Value of the difference
#TODO Tied to 2 dimensions now. should expand to a new algorithm using nditer
for row in range(0, a_shape): # Loop through all rows
for column in range(row+1, a_shape): # Loop through columns that are > than row, this is to prevent double counting and speed up stuff.
diff = a_matrix[row, column] - b_matrix[row, column]
sum_term += 2 * (diff)**2
denominator = 1 / (a_shape * (a_shape-1))
return numpy.sqrt(denominator * sum_term)
def check_dist_matrix(matrix, desc='Matrix'):
# Check if input is a list or an array-type.
if not isinstance(matrix, (list, numpy.ndarray)):
raise TypeError(f'{desc} is not a list or an array.')
# Check dimension of matrix
dimension = numpy.shape(matrix)
for idx in range(1, len(dimension)):
if dimension[idx-1] != dimension[idx]:
raise TypeError(f'{desc} is not a square matrix.')
# Check to see if symmetrical
assert numpy.allclose(matrix, matrix.T), f'{desc} is not symmetrical. Probably not a valid distance matrix.'
# Check if diagonal is zero
for x in range(0,len(dimension)):
assert matrix[x,x] == 0, f'Diagonal element ({x},{x}) is not zero in the mean matrix.'
def get_submatrices(a_matrix, size=8):
"""
Function that picks out submatrices
Say given the size 8 (default) then it'll loop through the square matrix and output all the possible contiguous submatrices into
one big array. Does the crazy checks first (check_dist_matrix()).
returns a list with all the arrays.
"""
check_dist_matrix(a_matrix, desc='Matrix')
size = int(size) # Make sure the size is an integer
matrix_size = len(a_matrix) # Grab the length of the input matrix
submatrices = [] # Empty submatrix
for idx in range(0, matrix_size - size):
#TODO Currently tied to 2 dimensions. should expand to a new algorithm using nditer.
temp_matrix = a_matrix[idx:idx+size, idx:idx+size]
try:
submatrices.append(temp_matrix)
except:
submatrices = [temp_matrix]
assert len(submatrices) == matrix_size - size, "Number of submatrices generated \({len(submatrices)}\) is not consistent with what's expected ({matrix_size - size})"
return submatrices
def read_individual_matrices(variant):
"""
Function that reads the individual distance matrices (from the .npy files) and concatenates them into one big list.
returns: the list of individual distance matrices and the number of distance matrices in the former.
"""
num_traj = 0 # Counter to check we have the correct final matrix, that is the sum of shape[0] == the shape[0] of 'matrices'
matrices = numpy.asarray([])
for file_idx in trange(1,6, desc=f'{variant} reading file:'): # Loop through the 5 files
temp = numpy.load(matrix_template.format(var=variant, file_num=file_idx))
num_traj += temp.shape[0]
try:
matrices = numpy.append(matrices, temp, axis=0)
except ValueError:
matrices = temp
assert matrices.shape[0] == num_traj, 'The combined numpy array matrix is not created correctly.'
return matrices, num_traj
def read_matrices(variant):
"""
Function that reads the distance matrices that are already concatenated into single .npy.
returns: the list of individual distance matrices and the number of distance matrices in the former.
"""
matrices = numpy.load(single_matrix_template.format(var=variant))
num_traj = len(matrices)
return matrices, num_traj
def load_weights(variant):
"""
Function that reads the weights and then explicitly normalize them if necessary.
"""
if args.single:
weights = numpy.loadtxt(single_weights_template.format(var=variant), skiprows=1, usecols=3)
else:
weights = numpy.loadtxt(weights_template.format(var=variant), skiprows=1)
if not numpy.isclose(sum(weights), 1):
print('Sum doesn\'t add up to 1, explicitly normalizing it now.')
weights = weights / sum(weights)
return weights
def calc_mean_matrix(use_weights=False):
"""
A catch-all function that reads individual distance matrix and then weights them if weights=True.
"""
for variant in tqdm(variants, desc='variants'): # Loop through variants
# Create dummy matrix and all that, assumed to be 58 x 58, here.
#TODO generalize this.
mean_matrix = numpy.zeros((58,58)) # Blank mean matrix
if args.single: # If the trajectories are already saved as 1 .npy
matrices, num_traj = read_matrices(variant)
else: # If trajectories are still saved as "per-run"
matrices, num_traj = read_individual_matrices(variant)
# If we're using weights
if use_weights:
weights = load_weights(variant)
else:
weights = numpy.ones(matrices.shape[0])
weights = weights / sum(weights)
assert matrices.shape[0] == weights.shape[0], f'Number of trajectories are not consistent. You have {matrices.shape[0]} segments but {weights.shape[0]} number of weights in the file.'
for traj_idx in trange(0, num_traj, desc=f'{variant} traj number'):
for row in range(0,58): # Loop through all rows
for column in range(row+1,58): # Loop through columns that are > than row, this is to prevent double counting and speed up stuff.
val = matrices[traj_idx, row, column]
mean_matrix[row, column] += weights[traj_idx] * val
mean_matrix[column, row] += weights[traj_idx] * val
# Final check to see if it's a valid matrix
check_dist_matrix(mean_matrix)
numpy.save(f'{rfc_path}/{variant}_eng/mean_matrix.npy', mean_matrix)
if __name__ == "__main__":
#compare_drms(subset=subset_var, size=size_var)
# Runs through the whole suite of generating a "mean" distance matrix then compare it to a list of refs for the overall dRMS.
#calc_mean_matrix(use_weights=True)
#compare_mean_drms(subset=subset_var, size=size_var)
compare_transformed_drms(sigma=sigma_var, radius=radius_var, subset=subset_var, size=size_var)
compare_transformed_mean_drms(sigma=sigma_var, radius=radius_var, subset=subset_var, size=size_var)
#calc_mean_matrix(use_weights=True)
pass