Skip to content

Commit 3cb1520

Browse files
committed
added SCORES_OF_INSERTED_BASES to output to address broadinstitute/SpliceAI-lookup#84
1 parent 4a39426 commit 3cb1520

File tree

1 file changed

+81
-12
lines changed

1 file changed

+81
-12
lines changed

spliceai/utils.py

+81-12
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
# the minimum raw score for a position to be included in the ALL_NON_ZERO_SCORES array
3434
MIN_SCORE_THRESHOLD = 0.01
3535

36+
INSERTED_BASES_CONTEXT = 5
3637

3738
class Annotator:
3839

@@ -148,6 +149,7 @@ def get_delta_scores_for_transcript(x_ref, x_alt, ref_len, alt_len, strand, cov,
148149
np.max(y_alt[:, cov//2:cov//2+alt_len], axis=1)[:, None, :],
149150
y_alt[:, cov//2+alt_len:]],
150151
axis=1)
152+
151153
#MNP handling
152154
elif ref_len > 1 and alt_len > 1:
153155
zblock = np.zeros((1,ref_len-1,3))
@@ -161,6 +163,21 @@ def get_delta_scores_for_transcript(x_ref, x_alt, ref_len, alt_len, strand, cov,
161163
return y_ref, y_alt
162164

163165

166+
def compute_scores_for_inserted_bases(y_ref, y_alt, alt_len, cov):
167+
# if the variant is an insertion, this array will contain the raw sores for the inserted bases.
168+
# this is used for addressing https://github.com/broadinstitute/SpliceAI-lookup/issues/84
169+
y_ref_inserted_bases = np.concatenate([
170+
y_ref[:, 1 + cov//2 - INSERTED_BASES_CONTEXT : 1 + cov//2],
171+
np.zeros((1, alt_len - 1, 3)),
172+
y_ref[:, 1 + cov//2 : 1 + cov//2 + INSERTED_BASES_CONTEXT],
173+
], axis=1)
174+
175+
y_alt_inserted_bases = y_alt[:, 1 + cov//2 - INSERTED_BASES_CONTEXT: 1 + cov//2 + (alt_len - 1) + INSERTED_BASES_CONTEXT]
176+
177+
assert y_ref_inserted_bases.shape == y_alt_inserted_bases.shape
178+
179+
return y_ref_inserted_bases, y_alt_inserted_bases
180+
164181

165182
def get_delta_scores(record, ann, dist_var, mask):
166183

@@ -197,7 +214,7 @@ def get_delta_scores(record, ann, dist_var, mask):
197214
logging.warning('Skipping record (ref too long): {}'.format(record))
198215
return scores
199216

200-
genomic_coords = np.arange(record.pos-cov//2, record.pos+cov//2 + 1)
217+
genomic_coords = np.arange(record.pos - cov//2, record.pos + cov//2 + 1)
201218

202219
# many of the transcripts in a gene can have the same tx start & stop positions, so their results can be cached
203220
# since SpliceAI scores (prior to masking) depend only on transcript start & stop coordinates and strand.
@@ -226,7 +243,7 @@ def get_delta_scores(record, ann, dist_var, mask):
226243
args = (x_ref, x_alt, ref_len, alt_len, strand, cov)
227244
if args not in delta_scores_transcript_cache:
228245
model_prediction_count += 1
229-
delta_scores_transcript_cache[args] = get_delta_scores_for_transcript(*args, ann=ann)
246+
delta_scores_transcript_cache[args] = get_delta_scores_for_transcript(*args, ann=ann)
230247

231248
y_ref, y_alt = delta_scores_transcript_cache[args]
232249

@@ -250,18 +267,59 @@ def get_delta_scores(record, ann, dist_var, mask):
250267
raise ValueError(f"SpliceAI internal error: len(genomic_coords) != y_alt.shape[1]: "
251268
f"{len(genomic_coords)} != {y_alt.shape[1]}")
252269

270+
DS_AG = (y[1, idx_pa, 1]-y[0, idx_pa, 1])*(1-mask_pa)
271+
DS_AL = (y[0, idx_na, 1]-y[1, idx_na, 1])*(1-mask_na)
272+
DS_DG = (y[1, idx_pd, 2]-y[0, idx_pd, 2])*(1-mask_pd)
273+
DS_DL = (y[0, idx_nd, 2]-y[1, idx_nd, 2])*(1-mask_nd)
274+
275+
DP_AG = int(idx_pa-cov//2)
276+
DP_AL = int(idx_na-cov//2)
277+
DP_DG = int(idx_pd-cov//2)
278+
DP_DL = int(idx_nd-cov//2)
279+
280+
if ref_len == 1 and alt_len >= 3 and (
281+
(DS_AG >= 0.2 and DP_AG == 0) or
282+
(DS_AL >= 0.2 and DP_AL == 0) or
283+
(DS_DG >= 0.2 and DP_DG == 0) or
284+
(DS_DL >= 0.2 and DP_DL == 0)):
285+
286+
inserted_bases_genomic_coords = np.concatenate([
287+
np.arange(record.pos - INSERTED_BASES_CONTEXT + 1, record.pos + 1),
288+
[f"+{offset}" for offset in np.arange(1, alt_len)],
289+
np.arange(record.pos + 1, record.pos + INSERTED_BASES_CONTEXT + 1),
290+
])
291+
292+
y_ref_inserted_bases, y_alt_inserted_bases = compute_scores_for_inserted_bases(
293+
y_ref, y_alt, alt_len, cov)
294+
295+
ref_seq = (
296+
seq[wid//2 - INSERTED_BASES_CONTEXT + 1: wid//2 + 1] +
297+
" " * (alt_len - 1) +
298+
seq[wid//2 + 1 : wid//2 + 1 + INSERTED_BASES_CONTEXT]
299+
)
300+
alt_seq = (
301+
seq[wid//2 - INSERTED_BASES_CONTEXT : wid//2] +
302+
record.alts[j][1:] +
303+
seq[wid//2 + len(record.ref) : wid//2 + len(record.ref) + INSERTED_BASES_CONTEXT]
304+
)
305+
306+
assert len(ref_seq) == len(alt_seq), f"len(ref_seq) != len(alt_seq): {len(ref_seq)} != {len(alt_seq)}"
307+
308+
else:
309+
inserted_bases_genomic_coords = ref_seq = alt_seq = y_ref_inserted_bases = y_alt_inserted_bases = None
310+
253311
scores.append({
254312
"ALLELE": record.alts[j],
255313
"NAME": genes[i],
256314
"STRAND": strands[i],
257-
"DS_AG": f"{(y[1, idx_pa, 1]-y[0, idx_pa, 1])*(1-mask_pa):{FLOAT_FORMAT}}",
258-
"DS_AL": f"{(y[0, idx_na, 1]-y[1, idx_na, 1])*(1-mask_na):{FLOAT_FORMAT}}",
259-
"DS_DG": f"{(y[1, idx_pd, 2]-y[0, idx_pd, 2])*(1-mask_pd):{FLOAT_FORMAT}}",
260-
"DS_DL": f"{(y[0, idx_nd, 2]-y[1, idx_nd, 2])*(1-mask_nd):{FLOAT_FORMAT}}",
261-
"DP_AG": int(idx_pa-cov//2),
262-
"DP_AL": int(idx_na-cov//2),
263-
"DP_DG": int(idx_pd-cov//2),
264-
"DP_DL": int(idx_nd-cov//2),
315+
"DS_AG": f"{DS_AG:{FLOAT_FORMAT}}",
316+
"DS_AL": f"{DS_AL:{FLOAT_FORMAT}}",
317+
"DS_DG": f"{DS_DG:{FLOAT_FORMAT}}",
318+
"DS_DL": f"{DS_DL:{FLOAT_FORMAT}}",
319+
"DP_AG": DP_AG,
320+
"DP_AL": DP_AL,
321+
"DP_DG": DP_DG,
322+
"DP_DL": DP_DL,
265323
"DS_AG_REF": f"{y[0, idx_pa, 1]:{FLOAT_FORMAT}}",
266324
"DS_AL_REF": f"{y[0, idx_na, 1]:{FLOAT_FORMAT}}",
267325
"DS_DG_REF": f"{y[0, idx_pd, 2]:{FLOAT_FORMAT}}",
@@ -282,9 +340,20 @@ def get_delta_scores(record, ann, dist_var, mask):
282340
) if any(score >= MIN_SCORE_THRESHOLD for score in (ref_acceptor_score, alt_acceptor_score, ref_donor_score, ref_acceptor_score))
283341
or i in (idx_pa, idx_na, idx_pd, idx_nd)
284342
],
343+
"SCORES_OF_INSERTED_BASES": [] if y_alt_inserted_bases is None else [
344+
{
345+
"chrom": chrom,
346+
"pos": genomic_coord,
347+
"ref": ref_base,
348+
"alt": alt_base,
349+
"RA": f"{ref_acceptor_score:{FLOAT_FORMAT}}",
350+
"AA": f"{alt_acceptor_score:{FLOAT_FORMAT}}",
351+
"RD": f"{ref_donor_score:{FLOAT_FORMAT}}",
352+
"AD": f"{alt_donor_score:{FLOAT_FORMAT}}",
353+
} for i, (genomic_coord, ref_base, alt_base, ref_acceptor_score, alt_acceptor_score, ref_donor_score, alt_donor_score) in enumerate(zip(
354+
inserted_bases_genomic_coords, ref_seq, alt_seq, y_ref_inserted_bases[0, :, 1], y_alt_inserted_bases[0, :, 1], y_ref_inserted_bases[0, :, 2], y_alt_inserted_bases[0, :, 2]))
355+
],
285356
})
286357

287-
#print(f"Done computing scores. Hit cache for {total_count - model_prediction_count:,d} out of {total_count:,d} transcripts")
288-
289358
return scores
290359

0 commit comments

Comments
 (0)