33
33
# the minimum raw score for a position to be included in the ALL_NON_ZERO_SCORES array
34
34
MIN_SCORE_THRESHOLD = 0.01
35
35
36
+ INSERTED_BASES_CONTEXT = 5  # number of flanking positions kept on each side of an insertion when reporting per-base scores for the inserted bases
36
37
37
38
class Annotator :
38
39
@@ -148,6 +149,7 @@ def get_delta_scores_for_transcript(x_ref, x_alt, ref_len, alt_len, strand, cov,
148
149
np .max (y_alt [:, cov // 2 :cov // 2 + alt_len ], axis = 1 )[:, None , :],
149
150
y_alt [:, cov // 2 + alt_len :]],
150
151
axis = 1 )
152
+
151
153
#MNP handling
152
154
elif ref_len > 1 and alt_len > 1 :
153
155
zblock = np .zeros ((1 ,ref_len - 1 ,3 ))
@@ -161,6 +163,21 @@ def get_delta_scores_for_transcript(x_ref, x_alt, ref_len, alt_len, strand, cov,
161
163
return y_ref , y_alt
162
164
163
165
166
def compute_scores_for_inserted_bases(y_ref, y_alt, alt_len, cov, context=None):
    """Return aligned raw ref/alt scores for an insertion and its flanks.

    This is used for addressing
    https://github.com/broadinstitute/SpliceAI-lookup/issues/84

    Both returned arrays cover, in order: ``context`` positions ending at the
    variant position, the ``alt_len - 1`` inserted bases, and ``context``
    positions after the variant position. The inserted bases do not exist in
    the reference, so their reference scores are filled with zeros.

    Args:
        y_ref: reference scores, indexed as [batch, position, channel].
        y_alt: alternate-allele scores with the same layout.
            NOTE(review): the slice below assumes y_alt holds one position
            per inserted base right after the variant position - confirm that
            the caller passes predictions that have not been collapsed.
        alt_len: length of the alt allele (anchor base plus inserted bases).
        cov: width of the scored window; index ``cov // 2`` is the variant
            position.
        context: number of flanking positions to keep on each side. Defaults
            to the module-level INSERTED_BASES_CONTEXT.

    Returns:
        Tuple ``(y_ref_inserted_bases, y_alt_inserted_bases)`` of equal shape
        ``(batch, 2 * context + alt_len - 1, channels)``.

    Raises:
        ValueError: if the two windows do not end up with the same shape
            (for example because the scored window is too short).
    """
    if context is None:
        context = INSERTED_BASES_CONTEXT

    # index of the first position after the variant (anchor) position
    after_variant = 1 + cov // 2
    inserted_count = alt_len - 1

    # Size and type the zero block from y_ref itself rather than hard-coding
    # (1, n, 3) float64, so the reference scores are not silently upcast.
    ref_padding = np.zeros(
        (y_ref.shape[0], inserted_count, y_ref.shape[2]), dtype=y_ref.dtype)

    y_ref_inserted_bases = np.concatenate([
        y_ref[:, after_variant - context:after_variant],
        ref_padding,
        y_ref[:, after_variant:after_variant + context],
    ], axis=1)

    y_alt_inserted_bases = y_alt[
        :, after_variant - context:after_variant + inserted_count + context]

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if y_ref_inserted_bases.shape != y_alt_inserted_bases.shape:
        raise ValueError(
            f"SpliceAI internal error: y_ref_inserted_bases.shape != y_alt_inserted_bases.shape: "
            f"{y_ref_inserted_bases.shape} != {y_alt_inserted_bases.shape}")

    return y_ref_inserted_bases, y_alt_inserted_bases
180
+
164
181
165
182
def get_delta_scores (record , ann , dist_var , mask ):
166
183
@@ -197,7 +214,7 @@ def get_delta_scores(record, ann, dist_var, mask):
197
214
logging .warning ('Skipping record (ref too long): {}' .format (record ))
198
215
return scores
199
216
200
- genomic_coords = np .arange (record .pos - cov // 2 , record .pos + cov // 2 + 1 )
217
+ genomic_coords = np .arange (record .pos - cov // 2 , record .pos + cov // 2 + 1 )
201
218
202
219
# many of the transcripts in a gene can have the same tx start & stop positions, so their results can be cached
203
220
# since SpliceAI scores (prior to masking) depend only on transcript start & stop coordinates and strand.
@@ -226,7 +243,7 @@ def get_delta_scores(record, ann, dist_var, mask):
226
243
args = (x_ref , x_alt , ref_len , alt_len , strand , cov )
227
244
if args not in delta_scores_transcript_cache :
228
245
model_prediction_count += 1
229
- delta_scores_transcript_cache [args ] = get_delta_scores_for_transcript (* args , ann = ann )
246
+ delta_scores_transcript_cache [args ] = get_delta_scores_for_transcript (* args , ann = ann )
230
247
231
248
y_ref , y_alt = delta_scores_transcript_cache [args ]
232
249
@@ -250,18 +267,59 @@ def get_delta_scores(record, ann, dist_var, mask):
250
267
raise ValueError (f"SpliceAI internal error: len(genomic_coords) != y_alt.shape[1]: "
251
268
f"{ len (genomic_coords )} != { y_alt .shape [1 ]} " )
252
269
270
+ DS_AG = (y [1 , idx_pa , 1 ]- y [0 , idx_pa , 1 ])* (1 - mask_pa )
271
+ DS_AL = (y [0 , idx_na , 1 ]- y [1 , idx_na , 1 ])* (1 - mask_na )
272
+ DS_DG = (y [1 , idx_pd , 2 ]- y [0 , idx_pd , 2 ])* (1 - mask_pd )
273
+ DS_DL = (y [0 , idx_nd , 2 ]- y [1 , idx_nd , 2 ])* (1 - mask_nd )
274
+
275
+ DP_AG = int (idx_pa - cov // 2 )
276
+ DP_AL = int (idx_na - cov // 2 )
277
+ DP_DG = int (idx_pd - cov // 2 )
278
+ DP_DL = int (idx_nd - cov // 2 )
279
+
280
+ if ref_len == 1 and alt_len >= 3 and (
281
+ (DS_AG >= 0.2 and DP_AG == 0 ) or
282
+ (DS_AL >= 0.2 and DP_AL == 0 ) or
283
+ (DS_DG >= 0.2 and DP_DG == 0 ) or
284
+ (DS_DL >= 0.2 and DP_DL == 0 )):
285
+
286
+ inserted_bases_genomic_coords = np .concatenate ([
287
+ np .arange (record .pos - INSERTED_BASES_CONTEXT + 1 , record .pos + 1 ),
288
+ [f"+{ offset } " for offset in np .arange (1 , alt_len )],
289
+ np .arange (record .pos + 1 , record .pos + INSERTED_BASES_CONTEXT + 1 ),
290
+ ])
291
+
292
+ y_ref_inserted_bases , y_alt_inserted_bases = compute_scores_for_inserted_bases (
293
+ y_ref , y_alt , alt_len , cov )
294
+
295
+ ref_seq = (
296
+ seq [wid // 2 - INSERTED_BASES_CONTEXT + 1 : wid // 2 + 1 ] +
297
+ " " * (alt_len - 1 ) +
298
+ seq [wid // 2 + 1 : wid // 2 + 1 + INSERTED_BASES_CONTEXT ]
299
+ )
300
+ alt_seq = (
301
+ seq [wid // 2 - INSERTED_BASES_CONTEXT : wid // 2 ] +
302
+ record .alts [j ][1 :] +
303
+ seq [wid // 2 + len (record .ref ) : wid // 2 + len (record .ref ) + INSERTED_BASES_CONTEXT ]
304
+ )
305
+
306
+ assert len (ref_seq ) == len (alt_seq ), f"len(ref_seq) != len(alt_seq): { len (ref_seq )} != { len (alt_seq )} "
307
+
308
+ else :
309
+ inserted_bases_genomic_coords = ref_seq = alt_seq = y_ref_inserted_bases = y_alt_inserted_bases = None
310
+
253
311
scores .append ({
254
312
"ALLELE" : record .alts [j ],
255
313
"NAME" : genes [i ],
256
314
"STRAND" : strands [i ],
257
- "DS_AG" : f"{ ( y [ 1 , idx_pa , 1 ] - y [ 0 , idx_pa , 1 ]) * ( 1 - mask_pa ) :{FLOAT_FORMAT }} " ,
258
- "DS_AL" : f"{ ( y [ 0 , idx_na , 1 ] - y [ 1 , idx_na , 1 ]) * ( 1 - mask_na ) :{FLOAT_FORMAT }} " ,
259
- "DS_DG" : f"{ ( y [ 1 , idx_pd , 2 ] - y [ 0 , idx_pd , 2 ]) * ( 1 - mask_pd ) :{FLOAT_FORMAT }} " ,
260
- "DS_DL" : f"{ ( y [ 0 , idx_nd , 2 ] - y [ 1 , idx_nd , 2 ]) * ( 1 - mask_nd ) :{FLOAT_FORMAT }} " ,
261
- "DP_AG" : int ( idx_pa - cov // 2 ) ,
262
- "DP_AL" : int ( idx_na - cov // 2 ) ,
263
- "DP_DG" : int ( idx_pd - cov // 2 ) ,
264
- "DP_DL" : int ( idx_nd - cov // 2 ) ,
315
+ "DS_AG" : f"{ DS_AG :{FLOAT_FORMAT }} " ,
316
+ "DS_AL" : f"{ DS_AL :{FLOAT_FORMAT }} " ,
317
+ "DS_DG" : f"{ DS_DG :{FLOAT_FORMAT }} " ,
318
+ "DS_DL" : f"{ DS_DL :{FLOAT_FORMAT }} " ,
319
+ "DP_AG" : DP_AG ,
320
+ "DP_AL" : DP_AL ,
321
+ "DP_DG" : DP_DG ,
322
+ "DP_DL" : DP_DL ,
265
323
"DS_AG_REF" : f"{ y [0 , idx_pa , 1 ]:{FLOAT_FORMAT }} " ,
266
324
"DS_AL_REF" : f"{ y [0 , idx_na , 1 ]:{FLOAT_FORMAT }} " ,
267
325
"DS_DG_REF" : f"{ y [0 , idx_pd , 2 ]:{FLOAT_FORMAT }} " ,
@@ -282,9 +340,20 @@ def get_delta_scores(record, ann, dist_var, mask):
282
340
) if any (score >= MIN_SCORE_THRESHOLD for score in (ref_acceptor_score , alt_acceptor_score , ref_donor_score , ref_acceptor_score ))
283
341
or i in (idx_pa , idx_na , idx_pd , idx_nd )
284
342
],
343
+ "SCORES_OF_INSERTED_BASES" : [] if y_alt_inserted_bases is None else [
344
+ {
345
+ "chrom" : chrom ,
346
+ "pos" : genomic_coord ,
347
+ "ref" : ref_base ,
348
+ "alt" : alt_base ,
349
+ "RA" : f"{ ref_acceptor_score :{FLOAT_FORMAT }} " ,
350
+ "AA" : f"{ alt_acceptor_score :{FLOAT_FORMAT }} " ,
351
+ "RD" : f"{ ref_donor_score :{FLOAT_FORMAT }} " ,
352
+ "AD" : f"{ alt_donor_score :{FLOAT_FORMAT }} " ,
353
+ } for i , (genomic_coord , ref_base , alt_base , ref_acceptor_score , alt_acceptor_score , ref_donor_score , alt_donor_score ) in enumerate (zip (
354
+ inserted_bases_genomic_coords , ref_seq , alt_seq , y_ref_inserted_bases [0 , :, 1 ], y_alt_inserted_bases [0 , :, 1 ], y_ref_inserted_bases [0 , :, 2 ], y_alt_inserted_bases [0 , :, 2 ]))
355
+ ],
285
356
})
286
357
287
- #print(f"Done computing scores. Hit cache for {total_count - model_prediction_count:,d} out of {total_count:,d} transcripts")
288
-
289
358
return scores
290
359
0 commit comments