-
Notifications
You must be signed in to change notification settings - Fork 2
/
getFeatures-getReadFeatures.py
392 lines (306 loc) · 10.2 KB
/
getFeatures-getReadFeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
#!/usr/bin/python
#Processes reads, parsing CIGAR strings and extracting qualities
#and writes gz output
import csv
import gzip
import os
import numpy
import pysam
import re
import sys
bam = sys.argv[1]
bed = sys.argv[2]
outFeature = sys.argv[3]
#Open BAM file with pysam
bamData = pysam.AlignmentFile(bam, "rb")
#Open BED file and read regions to interrogate
regions = []
with open(bed, "r") as b:
for region in csv.reader(b, delimiter="\t"):
#add 1 since BED is 0-based
regions.append([region[0], int(region[1]) + 1, int(region[2]) + 1])
#Prepare to write TSV GZ
outFeatureRaw = outFeature + ".raw" #when finish, rename to remove .raw
o = gzip.open(outFeatureRaw, "wt", newline="")
writer = csv.writer(o, delimiter="\t", lineterminator="\n")
#where we'll store info about six features
#two qualities
# mapQ - mapping quality
# baseQ - base quality
#some from CIGAR string
# allSC - base is SC
# edgeSC - SC ends on this base
# ins - insertion
# edgeDel - deletion
# scCons - SC consistency
# meanSCQual - baseQ of SC bases
# nHQual - proportion of reads with mapQ above threshold (13) [really calc'ed from baseQ]
# scDist - signed mean distance along reads to SC (0 if base is SC)
#keys are [chrom][pos][featureType]
features = {}
def recordFeature(chrom, pos, value, featureType):
#value could be None (for increment allSC, edgeSC, ins, edgeDel),
#a float (for qualities)
#or list [scPos, scBase] for scCons
if chrom not in features:
features[chrom] = {}
if pos not in features[chrom]:
features[chrom][pos] = {}
if featureType == "scCons":
scPos = value[0]
scBase = value[1]
#features[chrom][pos][featureType] is all for scCons
#a dict, where keys are positions
#and values are dictionaries
#of keys A, T, C, G with values how many times each occurs
if featureType not in features[chrom][pos]:
features[chrom][pos][featureType] = {}
if scPos not in features[chrom][pos][featureType]:
features[chrom][pos][featureType][scPos] = {}
if scBase not in features[chrom][pos][featureType][scPos]:
features[chrom][pos][featureType][scPos][scBase] = 1
else:
features[chrom][pos][featureType][scPos][scBase] += 1
elif featureType in ["allSC", "edgeSC", "ins", "edgeDel"]:
#set to 1 or increment
if featureType not in features[chrom][pos]:
features[chrom][pos][featureType] = 1
else:
features[chrom][pos][featureType] += 1
else: #mapQ, baseQ, scQual, nHQual, scDist
#append to array so can take mean later
if featureType not in features[chrom][pos]:
features[chrom][pos][featureType] = [value]
else:
features[chrom][pos][featureType].append(value)
def recordInsertion(i, chrom, oStart, oEnd, _):
#mark the insertion point
recordFeature(chrom, oStart, None, "ins")
def recordDeletion(i, chrom, oStart, oEnd, _):
#mark the edges of the deletion
recordFeature(chrom, oStart, None, "edgeDel")
if oEnd != oStart - 1:
#the deletion is not a single base
#(in which case there's only position to mark)
#actually, for single-base deletions, should we mark the base twice?
#decrement to avoid off by 1
recordFeature(chrom, oEnd - 1, None, "edgeDel")
def queryPosForEdgeSCPos(scOnLeft, edgeSCPos, read):
#get_aligned_pairs() returns list of [queryPos, pos]
# except pos is None for SC bases
# so find queryPos for base just after/before SC
aligned = read.get_aligned_pairs()
#subtract one to match genomic coordinate to pysam coordinate
boundaryPos = edgeSCPos - 1
if scOnLeft:
#add 1 since need first non SC base
boundaryPos += 1
else:
#subtract 1 since need last non SC base
boundaryPos -= 1
try:
boundaryQueryPos = [q for q in aligned if q[1] == boundaryPos][0][0]
except:
print("Warning: Could not find SC query position for read: {}".format(read.tostring()))
return None
#shift back to find query pos for edgSCPos
if scOnLeft:
edgeSCQueryPos = boundaryQueryPos - 1
else:
edgeSCQueryPos = boundaryQueryPos + 1
return edgeSCQueryPos
def recordSoftClipping(i, chrom, oStart, oEnd, read):
#1. Record all SC
#SC covers [oStart, oEnd)
for pos in range(oStart, oEnd):
recordFeature(chrom, pos, None, "allSC")
#2. Record distance to SC
#for positions that are not SC, record signed distance to SC
#for positions that are SC, record 0
rStart = read.reference_start + 1
rEnd = rStart + read.query_length
for pos in range(rStart, rEnd):
if pos < oStart:
#before SC
distance = oStart - pos
elif pos >= oEnd:
#after SC
distance = oEnd - pos
else:
distance = 0
recordFeature(chrom, pos, distance, "scDist")
#3. Record inner edge of SC
scOnLeft = i == 0
if scOnLeft:
#last SC base is on right
edgeSCPos = oEnd - 1
else:
#last SC base is on left
edgeSCPos = oStart
recordFeature(chrom, edgeSCPos, None, "edgeSC")
#3. Record base for all SC
def recordSCBase(queryPos, pos):
base = read.query_sequence[queryPos]
recordFeature(chrom, edgeSCPos, [pos, base], "scCons")
#also record base quality
baseQ = read.query_qualities[queryPos]
recordFeature(chrom, edgeSCPos, baseQ, "scQual")
edgeSCQueryPos = queryPosForEdgeSCPos(scOnLeft, edgeSCPos, read)
queryPos = edgeSCQueryPos
if not queryPos:
return
if scOnLeft:
#start at inner edge of SC, move along SC toward left of edge read
for pos in reversed(range(oStart, oEnd)):
recordSCBase(queryPos, pos)
queryPos -= 1
else:
#start at inner edge of SC, move along SC toward right edge of read
for pos in range(oStart, oEnd):
recordSCBase(queryPos, pos)
queryPos += 1
def processCIGAR(chrom, read):
#add 1 to make position match what's in BAM
rStart = read.reference_start + 1
cigar = read.cigarstring
if not read.cigartuples:
return
#loop over operations in CIGAR string
for i, (oCode, oWidth) in enumerate(read.cigartuples):
#we only care about
# insertions (1),
# deletions (2),
# and soft-clipping (4)
if oCode not in [1, 2, 4]:
continue
#find the start and end of the operation
if (i == 0):
#first operation in the read:
#read starts at first mapped base
#so subtract oWidth to figure out actual start
oStart = rStart - oWidth
oEnd = rStart
else:
#start at the start of the read,
oStart = rStart
#add the sum of the oWidths of
#all consuming operations before this
nonConsumingCodes = [1, 4, 5, 6]
previous = read.cigartuples[:i]
previousConsuming = [c for c in previous if c[0] not in nonConsumingCodes]
previousConsumingWidthsSum = sum([int(c[1]) for c in previousConsuming])
#then adjust oStart and oEnd
oStart += previousConsumingWidthsSum
oEnd = oStart + oWidth
#record operation
{
1 : recordInsertion,
2 : recordDeletion,
4 : recordSoftClipping
}[oCode](i, chrom, oStart, oEnd, read)
#pass read just for recordSoftClipping so can record SC bases
def processQualities(chrom, read):
#separate cigar and quality extraction into separate functions?
#Extract qualities from reads
#mapQ
mapQ = read.mapping_quality
#apply to all places where read matches to ref
for pos in read.get_reference_positions():
recordFeature(chrom, pos, mapQ, "mapQ")
#extract base quality for each base
for [queryPos, pos] in read.get_aligned_pairs():
if queryPos is not None:
#skip del, refskip?
baseQ = read.query_qualities[queryPos]
recordFeature(chrom, pos, baseQ, "baseQ")
def getSCCons(f, pos, nullValue):
if "scCons" not in f or not f["scCons"]:
return nullValue
#SHORTCUT: if edgeSC == 1, scCons == 1
if "edgeSC" in f and f["edgeSC"] == 1:
return 1.0
bases = f["scCons"]
consistencies = []
for scPos in bases:
scBaseCounts = bases[scPos].values()
#this is dict, where keys are A, T, C, G
#and values are how often each occurs
#do values to get just occurences
consistency = max(scBaseCounts) / sum(scBaseCounts) #ratio of times most common element appears
consistencies.append(consistency)
return numpy.mean(consistencies)
def getFeatures(chrom, pos, nullValue):
nFeatures = 10
if chrom not in features or pos not in features[chrom]:
data = [nullValue] * nFeatures
else:
f = features[chrom][pos]
scCons = getSCCons(f, pos, nullValue)
#get meanBaseQ -- AND nHQual
if "baseQ" not in f or not f["baseQ"]:
meanBaseQ = nullValue
nHQual = nullValue
else:
bQs = f["baseQ"]
meanBaseQ = numpy.mean(bQs)
nHQual = len(list(filter(lambda bQ : bQ > 13, bQs))) / float(len(bQs))
#get mean mapQ
if "mapQ" not in f or not f["mapQ"]:
meanMapQ = nullValue
else:
meanMapQ = numpy.mean(f["mapQ"])
#get mean scQual
if "scQual" not in f or not f["scQual"]:
meanSCQual = nullValue
else:
meanSCQual = numpy.mean(f["scQual"])
#get mean scDist
if "scDist" not in f or not f["scDist"]:
meanSCDist = 1000
#nullValue is low, which would provide strong signal that close to SC
#so when we don't know, use a high placeholder flag
else:
meanSCDist = numpy.mean(f["scDist"])
data = [
meanMapQ,
meanBaseQ,
f["allSC"] if "allSC" in f else nullValue,
f["edgeSC"] if "edgeSC" in f else nullValue,
f["ins"] if "ins" in f else nullValue,
f["edgeDel"] if "edgeDel" in f else nullValue,
scCons,
meanSCQual,
nHQual,
meanSCDist
]
return data
#Iterate over BAM file
#Populating features with values to write out
windowSize = 10000 #If bottleneck is I/O, using bigger windowSize should make faster
windowBuffer = 300
for [chrom, start, end] in regions:
windowStart = start
while windowStart < end:
windowEnd = windowStart + windowSize
if windowEnd > end:
windowEnd = end
features.clear()
#Calculating
#Extract reads from this region
#Store the results in features
for read in bamData.fetch(chrom, max(0, windowStart - windowBuffer), windowEnd + windowBuffer):
processCIGAR(chrom, read)
processQualities(chrom, read)
#Writing
for pos in range(windowStart, windowEnd):
#note that this makes sure that all the positions we pull data for are in fact in this window
#just iterating over the positions in features[chrom] could result in
#pulling data for positoins outside this window
#since reads that are both inside and outside the window can be pulled by fetch
nullValue = 0
featuresData = getFeatures(chrom, pos, nullValue)
writer.writerow([chrom, pos] + featuresData)
#Shift window
windowStart += windowSize
os.rename(outFeatureRaw, outFeature)
print("Done.")