-
Notifications
You must be signed in to change notification settings - Fork 3
/
helpers.py
482 lines (359 loc) · 18.1 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
import os
import h5py
import numpy as np
from joblib import Parallel, delayed
from scipy.stats import chi2
from tensorflow.python.client import device_lib
from tqdm import tqdm
import random
from parameters_complete import SYN_DATA_DIR, ttbr as ttbr, syn_n_subjects, pnorm_feature_scaling, inform_snps, seed, \
REAL_DATA_DIR
def get_available_gpus():
local_device_protos = device_lib.list_local_devices()
return len([x.name for x in local_device_protos if x.device_type == 'GPU'])
def generate_name_from_params(params):
RUN_DIR_NAME = ""
for key, val in params.items():
if key not in ['feature_matrix_path', 'y_path', 'epochs','verbose', 'optimizer', 'batch_size']:
RUN_DIR_NAME += "{}-{}__".format(key, val)
RUN_DIR_NAME += "{}".format(os.environ['SGE_TASK_ID'])
return RUN_DIR_NAME
def remove_small_frequencies(chromosome):
"""
This returns a chromosom with only minor allel freq > 0.15
This chromosom can be safely used to generate synthetic genotypes.
The returned value CAN CONTAIN UNMAPPED SNPs !
"""
chromosome[chromosome == 48] = 255
n_indiv = chromosome.shape[0]
lex_min = np.tile(np.min(chromosome, axis=(0, 2)), [n_indiv, 1])
allel1 = chromosome[:, :, 0]
allel2 = chromosome[:, :, 1]
lexmin_mask_1 = (allel1 == lex_min)
lexmin_mask_2 = (allel2 == lex_min)
maf = (lexmin_mask_1.sum(0) + lexmin_mask_2.sum(0))/(2*n_indiv) # n_snps
maf = np.minimum(maf, 1-maf)
chromosome[chromosome == 255] = 48
chrom_low_f_removed = chromosome[:, maf > 0.15, :]
chrom_low_f_removed.sort()
check_genotype_unique_allels(chrom_low_f_removed)
return chrom_low_f_removed
def check_genotype_unique_allels(genotype):
assert(max([len(np.unique(genotype[:, i, :]))
for i in range(genotype.shape[1])]) <= 3)
def generate_syn_genotypes(root_path=SYN_DATA_DIR, n_subjects=syn_n_subjects, n_info_snps=20, n_noise_snps=10000, quantity=1):
""" Generate synthetic genotypes and labels by removing all minor allels with low frequency, and missing SNPs.
First step of data preprocessing, has to be followed by string_to_featmat()
> Checks that that each SNP in each chromosome has at most 2 unique values in the whole dataset.
"""
print("Starting synthetic genotypes generation...")
try:
os.remove(os.path.join(root_path, 'genomic.h5py'))
except FileNotFoundError:
pass
with h5py.File(os.path.join(REAL_DATA_DIR,'CD','chromo_2.mat'), 'r') as f2:
chrom2_full = np.array(f2.get('X')).T
chrom2_full = chrom2_full.reshape(chrom2_full.shape[0], -1, 3)[:, :, :2]
chrom2_full = remove_small_frequencies(chrom2_full)
chrom2_full = chrom2_full[:, :n_noise_snps]
assert chrom2_full.shape[0] > n_subjects
chrom2 = chrom2_full[:n_subjects]
with h5py.File(os.path.join(REAL_DATA_DIR,'CD', 'chromo_1.mat'), 'r') as f:
chrom1_full = np.array(f.get('X')).T
chrom1_full = chrom1_full.reshape(chrom1_full.shape[0], -1, 3)[:, :, :2]
chrom1_full = remove_small_frequencies(chrom1_full)
assert chrom1_full.shape[0] > n_subjects
chrom1 = chrom1_full[:n_subjects] # Keep only 300 people
# Create #rep different datasets, each with a different set of informative SNPs - no, Alex made them all different, but they only need to be random, not all different. otherwise we couldnt have 10000 datasets.
half_noise_size = int(n_noise_snps/2)
with h5py.File(os.path.join(root_path, 'genomic.h5py'), 'w') as file:
for i in tqdm(range(quantity)):
# random starting position
start_info = random.choice(np.arange(len(chrom1[0,:])-n_info_snps))
#chrom1_subset = chrom1[:, i*n_info_snps:(i+1)*n_info_snps]
chrom1_subset = chrom1[:, start_info:(start_info+n_info_snps)]
data = np.concatenate((chrom2[:, :half_noise_size], chrom1_subset, chrom2[:, half_noise_size:half_noise_size*2]), axis=1)
## If the number of encoded SNPs is insufficient
if data.shape[1] != n_info_snps + n_noise_snps:
raise Exception("Not enough SNPs")
# Write everything!
file.create_dataset(str(i), data=data)
return os.path.join(root_path, 'genomic.h5py')
def generate_syn_phenotypes(root_path=SYN_DATA_DIR, tower_to_base_ratio=ttbr, n_info_snps=20, n_noise_snps=10000, quantity=1):
"""
> Assumes that each SNP has at most 3 unique values in the whole dataset (Two allels and possibly unmapped values)
IMPORTANT: DOES NOT LOAD FROM FILE
returns: dict(key, labels)
"""
print("Starting synthetic phenotypes generation...")
# Generate Labels from UNIQUE SNP at position 9
info_snp_idx = int(n_noise_snps/2) + int(n_info_snps/2)
labels_dict = {}
def f(genotype, key):
n_indiv = genotype.shape[0]
info_snp = genotype[:, info_snp_idx] # (n,2)
major_allel = np.max(info_snp)
major_mask_1 = (info_snp[:, 0] == major_allel) # n
major_mask_2 = (info_snp[:, 1] == major_allel) # n
invalid_mask = (info_snp[:, 0] == 48) | (info_snp[:, 1] == 48)
nb_major_allels = np.sum(
[major_mask_1, major_mask_2, invalid_mask], axis=0) - 1.0 # n
probabilities = 1 / \
(1 + np.exp(-tower_to_base_ratio * (nb_major_allels - np.median(nb_major_allels))))
random_vector = np.random.RandomState(seed).uniform(low=0.0, high=1.0, size=n_indiv)
labels = np.where(probabilities > random_vector, 1, -1)
assert genotype.shape[0] == labels.shape[0]
labels_dict[key] = labels
del genotype
with h5py.File(os.path.join(root_path, 'genomic.h5py'), 'r') as h5py_file:
Parallel(n_jobs=-1, require='sharedmem')(delayed(f)(h5py_file[str(i)][:],str(i)) for i in tqdm(range(quantity)))
return labels_dict
def char_matrix_to_featmat(data, embedding_type='2d', norm_feature_scaling=pnorm_feature_scaling):
### Global Parameters ###
(n_subjects, num_snp3, _) = data.shape
# Computes lexicographically highest and lowest nucleotides for each position of each strand
lexmax_overall_per_snp = np.max(data, axis=(0, 2))
#data_now = data.copy()
data[data == 48] = 255
lexmin_overall_per_snp = np.min(data, axis=(0, 2))
data[data == 255] = 48
# Masks showing valid or invalid indices
# SNPs being unchanged amongst the whole dataset, hold no information
lexmin_mask_per_snp = np.tile(lexmin_overall_per_snp, [n_subjects, 1])
lexmax_mask_per_snp = np.tile(lexmax_overall_per_snp, [n_subjects, 1])
invalid_bool_mask = (lexmin_mask_per_snp == lexmax_mask_per_snp)
allele1 = data[:, :, 0]
allele2 = data[:, :, 1]
# indices where allel1 equals the lowest value
allele1_lexminor_mask = (allele1 == lexmin_mask_per_snp)
# indices where allel1 equals the highest value
allele1_lexmajor_mask = (allele1 == lexmax_mask_per_snp)
# indices where allel2 equals the lowest value
allele2_lexminor_mask = (allele2 == lexmin_mask_per_snp)
# indices where allel2 equals the highest value
allele2_lexmajor_mask = (allele2 == lexmax_mask_per_snp)
f_m = np.zeros((n_subjects, num_snp3), dtype=(int, 3))
f_m[allele1_lexminor_mask & allele2_lexminor_mask] = [1, 0, 0]
f_m[(allele1_lexmajor_mask & allele2_lexminor_mask) |
(allele1_lexminor_mask & allele2_lexmajor_mask)] = [0, 1, 0]
f_m[allele1_lexmajor_mask & allele2_lexmajor_mask] = [0, 0, 1]
f_m[invalid_bool_mask] = [0, 0, 0]
f_m = np.reshape(f_m, (n_subjects, 3*num_snp3))
f_m = f_m.astype(np.double)
# Rescale feature matrix
f_m -= np.mean(f_m, dtype=np.float64, axis=0) # centering
stddev = ((np.abs(f_m)**norm_feature_scaling).mean(axis=0) * f_m.shape[1])**(1.0/norm_feature_scaling)
# Safe division
f_m = np.divide(f_m, stddev, out=np.zeros_like(f_m), where=stddev!=0)
# Reshape Feature matrix
if embedding_type == '2d':
pass
elif embedding_type == '3d':
f_m = np.reshape(f_m, (n_subjects, num_snp3, 3))
return f_m.astype(float)
def char_matrix_to_featmat_no_scaling(data, embedding_type='2d', norm_feature_scaling=pnorm_feature_scaling):
### Global Parameters ###
(n_subjects, num_snp3, _) = data.shape
# Computes lexicographically highest and lowest nucleotides for each position of each strand
lexmax_overall_per_snp = np.max(data, axis=(0, 2))
data[data == 48] = 255
lexmin_overall_per_snp = np.min(data, axis=(0, 2))
data[data == 255] = 48
# Masks showing valid or invalid indices
# SNPs being unchanged amongst the whole dataset, hold no information
lexmin_mask_per_snp = np.tile(lexmin_overall_per_snp, [n_subjects, 1])
lexmax_mask_per_snp = np.tile(lexmax_overall_per_snp, [n_subjects, 1])
invalid_bool_mask = (lexmin_mask_per_snp == lexmax_mask_per_snp)
allele1 = data[:, :, 0]
allele2 = data[:, :, 1]
# indices where allel1 equals the lowest value
allele1_lexminor_mask = (allele1 == lexmin_mask_per_snp)
# indices where allel1 equals the highest value
allele1_lexmajor_mask = (allele1 == lexmax_mask_per_snp)
# indices where allel2 equals the lowest value
allele2_lexminor_mask = (allele2 == lexmin_mask_per_snp)
# indices where allel2 equals the highest value
allele2_lexmajor_mask = (allele2 == lexmax_mask_per_snp)
f_m = np.zeros((n_subjects, num_snp3), dtype=(int, 3))
f_m[allele1_lexminor_mask & allele2_lexminor_mask] = [1, 0, 0]
f_m[(allele1_lexmajor_mask & allele2_lexminor_mask) |
(allele1_lexminor_mask & allele2_lexmajor_mask)] = [0, 1, 0]
f_m[allele1_lexmajor_mask & allele2_lexmajor_mask] = [0, 0, 1]
f_m[invalid_bool_mask] = [0, 0, 0]
f_m = np.reshape(f_m, (n_subjects, 3*num_snp3))
f_m = f_m.astype(np.double)
## Rescale feature matrix
#f_m -= np.mean(f_m, dtype=np.float64, axis=0) # centering
#stddev = ((np.abs(f_m)**norm_feature_scaling).mean(axis=0) * f_m.shape[1])**(1.0/norm_feature_scaling)
## Safe division
#f_m = np.divide(f_m, stddev, out=np.zeros_like(f_m), where=stddev!=0)
# Reshape Feature matrix
if embedding_type == '2d':
pass
elif embedding_type == '3d':
f_m = np.reshape(f_m, (n_subjects, num_snp3, 3))
return f_m.astype(float)
def genomic_to_featmat(embedding_type='2d', overwrite=False):
"""
Transforms a h5py dictionary of genomic matrix of chars to a tensor of features
{'0': genomic_mat_0, '1': genomic_mat_1,..., 'rep': genomic_mat_rep }
=> {'0': feat_mat_0, '1': feat_mat_1,..., 'rep': feat_mat_rep }
:param embedding_type: 3d ( n_subjects, 3*n_snps) or 2d ( n_subjects, n_snps, 3)
:param overwrite:
:return:
"""
data_path = os.path.join(SYN_DATA_DIR, 'genomic.h5py')
fm_path = os.path.join(SYN_DATA_DIR, embedding_type + '_fm.h5py')
if overwrite:
try:
os.remove(fm_path)
except (FileNotFoundError, OSError):
pass
if not overwrite:
try:
return h5py.File(fm_path, 'r')
except (FileNotFoundError, OSError):
print('Featmat not found: generating new one...')
with h5py.File(fm_path, 'w') as feature_file:
with h5py.File(data_path, 'r') as data_file:
for key in tqdm(list(data_file.keys())):
data = data_file[key][:]
f_m = char_matrix_to_featmat(data, embedding_type)
feature_file.create_dataset(key, data=f_m)
del data
return h5py.File(fm_path, 'r')
def moving_average(weights, window, pnorm_filter):
"""
Weights postprocessing filter based on a moving average
Inspired from https://uk.mathworks.com/matlabcentral/fileexchange/12276-moving_average-v3-1-mar-2008
"""
wnew = weights ** pnorm_filter
wnew = np.concatenate((
np.zeros(int((window - 1) / 2 + 1)),
wnew,
np.zeros(int((window - 1) / 2))),
axis=None)
wnew = np.cumsum(wnew)
wnew = (wnew[window:] - wnew[0:-window]) ** (1.0 / pnorm_filter)
wnew /= window ** (1.0 / pnorm_filter)
return wnew
def chi_square(data, labels):
"""
data: Char matrix (n * n_snp * 2)
labels: -1, 1 encoding
filter_indices: array of indices ordering the supposed top k p values
"""
n_subjects, n_snps, _ = data.shape
valid_mask = (data[:, :, 0] != 48) & (data[:, :, 1] != 48) # n , n_snps
# Find greatest and lowest char code, SNP-wise
data[data == 48] = 255
lex_min_per_snp = data.min(axis=(0, 2)) # n_snps
data[data == 255] = 48
# Valid case masks - n * n_snp
cases_mask = (labels == 1)
cases_mask = np.repeat(cases_mask, n_snps, axis=0).reshape((-1, n_snps))
controls_mask = (labels == -1)
controls_mask = np.repeat(controls_mask, n_snps,
axis=0).reshape((-1, n_snps))
# Valid control masks
valid_cases_mask = valid_mask & cases_mask
num_valid_cases = np.sum(valid_cases_mask, axis=0)
valid_controls_mask = valid_mask & controls_mask
num_valid_controls = np.sum(valid_controls_mask, axis=0)
# Allel pairs masks
lex_min_mat = np.tile(lex_min_per_snp, [n_subjects, 1])
minor_allels_mask = (data[:, :, 0] == lex_min_mat) & (
data[:, :, 1] == lex_min_mat)
both_allels_mask = (data[:, :, 0] != data[:, :, 1])
# Populate valid_table
"""
This table is a per-SNP lookup table for the allel combinations (rows) and case or control nature (columns)
valid_table[0] : cases
valid_table[1] : controls
valid_table[i,j]: [n_snp], for each SNP genotype, number of individuals bearing it
"""
valid_table = np.zeros((2, 3, n_snps))
valid_table[0, 0] = (minor_allels_mask & valid_cases_mask).sum(
axis=0) # 2 Minor allels, case
valid_table[0, 1] = (both_allels_mask & valid_cases_mask).sum(
axis=0) # Both allels, case
valid_table[0, 2] = num_valid_cases - valid_table[0, 0] - \
valid_table[0, 1] # 2 Major allels, case
valid_table[1, 0] = (minor_allels_mask & valid_controls_mask).sum(
axis=0) # 2 Minor allels, control
valid_table[1, 1] = (both_allels_mask & valid_controls_mask).sum(
axis=0) # Both allels, control
valid_table[1, 2] = num_valid_controls - valid_table[1, 0] - \
valid_table[1, 1] # 2 Major allels, case
# compute p-values
row_marginals = np.squeeze(np.sum(valid_table, axis=1))
col_marginals = np.squeeze(np.sum(valid_table, axis=0))
valid_n_by_snp = np.sum(row_marginals, axis=0)
chisq_value = np.zeros(n_snps)
for i in range(2):
for j in range(3):
# Independence assumption: e = nb_c_or_c * genotype_table / N
e = row_marginals[i] * col_marginals[j] / valid_n_by_snp
# Expected can be 0 if dataset does not contain this (genotype, group) combination
nonzero_mask = (e != 0)
observed = valid_table[i, j, nonzero_mask]
zero_mask = np.logical_not(nonzero_mask)
if (np.sum(zero_mask) != len(e)): # If expected has at least one nonzero value
chisq_value[nonzero_mask] += (observed -
e[nonzero_mask])**2 / e[nonzero_mask]
pvalues = chi2.sf(chisq_value, 2)
return pvalues
def plot_pvalues(complete_pvalues, top_indices_sorted, axes ):
print("Performing complete X2 to prepare plotting...")
axes.scatter(range(len(complete_pvalues)),-np.log10(complete_pvalues), marker='.', color='darkblue')
axes.scatter(top_indices_sorted,-np.log10(complete_pvalues[top_indices_sorted]), color='fuchsia')
#axes.set_ylabel('$-log_{10}$(p-value)')
#axes.set_xlabel('SNP position')
axes.set_ylim(bottom =0)
def compute_metrics(scores, truth, threshold):
"""
Computes True Positive Rate, Expected Number of False Positives, Family-wide error rate and precision
:param scores: Pvalues
:param truth: Vector of size (n_snps) with 1s where the loci are informative, else 0s
:param threshold: Loci with scores greater than this value will be considered informative
:return: tpr, enfr, fwer, precision
"""
n_experiments = scores.shape[0]
selected_pvalues_mask = (scores <= threshold) # n, n_snp
tp = (selected_pvalues_mask & truth).sum()
fp = (selected_pvalues_mask & np.logical_not(truth)).sum()
tpr = tp / (n_experiments * inform_snps)
enfr = fp / n_experiments # ENFR: false rejection of null hyp, that is FALSE POSITIVE
fwer = ((selected_pvalues_mask & np.logical_not(truth)).sum(axis=1) > 0).sum()/n_experiments
precision = (tp / (tp + fp)) if (tp + fp)!=0 else 0
assert 0 <= tpr <= 1
assert 0 <= fwer <= 1
assert 0 <= precision <= 1
return tpr, enfr, fwer, precision
def postprocess_weights(weights,top_k, filter_window_size, p_svm, p_pnorm_filter):
"""
Rescales weights to unit p-norm, then run a moving average and returns the top_k greatest indices
:param weights: SVM weights or relevance mapping from which draw the top_k greatest indices
:param top_k: number of selected indices
:param filter_window_size: size of window during the moving average
:param p_svm: norm used in the p_normalization step
:param p_pnorm_filter: norm used in the moving average
:return: sorted indices with top k greatest pvalues, averaged weights
"""
averaged_weights = postprocess_weights_without_avg(weights, p_svm)
averaged_weights = moving_average(averaged_weights,filter_window_size, p_pnorm_filter)
top_indices_sorted = averaged_weights.argsort()[::-1][:top_k] # Gets indices of top_k greatest elements
return top_indices_sorted, averaged_weights
def postprocess_weights_without_avg(weights, p_svm):
"""
First step of weights postprocessing. Rescales weights to unit 2-norm
:input shape (n_snps * 3)
:return shape (n_snps)
"""
if np.count_nonzero(weights)==0:
return np.zeros(weights.reshape(-1, 3).shape[0])
weights_ = np.absolute(weights)/np.linalg.norm(weights, ord=2)
weights_ = np.absolute(weights_.reshape(-1, 3))# Group weights by 3 (yields locus's importance measure)
weights_ = np.sum(weights_**p_svm, axis=1)**(1.0/p_svm)
weights_ /= np.linalg.norm(weights_, ord=2)
return weights_