forked from hovestadt/methylCtools
-
Notifications
You must be signed in to change notification settings - Fork 1
/
bconv.py
292 lines (235 loc) · 10.1 KB
/
bconv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python
#######################################
# methylCtools bconv
# v1.0.0
# 10 june 2018
#
# volker hovestadt
# developed at the german cancer research center, 2011-2015
# methylctools@hovestadt.bio
#
#
# reads are converted back to original state. positions are stored in read id.
# if a read (pair) maps illegally (reverse to Watson strand/forward to Crick
# strand, assuming strand specific protocol) it is set QCfail (flag +512).
# if there are more than INT non-CpG unconverted cytosines within the aligned part
# of a read, the read/pair is set QCfail.
#
# input must be in BAM format (unsorted (!) bwa output, use "-" for stdin).
# output is in BAM format, use "-" for stdout.
def mod_bconv(sysargv):
import sys
import argparse
import pysam
import re
import datetime
def nicetime(): return datetime.datetime.now().strftime("[bconv %Y-%m-%d %H:%M:%S]")
#######################################
# arguments, filehandles
parser = argparse.ArgumentParser(prog="methylCtools bconv", version="0.9.4", description="re-converts reads to original state following bwa alignment")
parser.add_argument("-s", "--silent", dest="qf", action="store_false", help="do not show status messages")
parser.add_argument("-m", "--metrics", metavar="metrics.txt", dest="outMETRICS", action="store", default=False, help="write metrics output")
groupinput = parser.add_argument_group("input files, required")
groupinput.add_argument("inBAM", metavar="aln.conv.bam", action="store", default=False, help="bwa alignment (unsorted), \"-\" for stdin")
groupoutput = parser.add_argument_group("output files, will be created")
groupoutput.add_argument("outBAM", metavar="aln.bam", action="store", default=False, help="converted alignment, \"-\" for stdout")
parser.add_argument("-u", "--maxunconv", metavar="INT", dest="maxnCGc", type=int, default=False, action="store", help="set reads containing > INT non-CG unconverted cytosines qc-fail [off]")
args = parser.parse_args(sysargv)
try:
samfileIN = pysam.Samfile(args.inBAM, "rb")
h = samfileIN.header # construct new header (merge alignments from Watson and Crick strand of same contig)
hseqnew = []
for tid in range(len(h["SQ"])): # header should be in the order of reference file generated by faconv, otherwise exit
if tid%2==0:
if h["SQ"][tid]["SN"][-2:] != "_W": sys.exit("methylCtools bconv: error: invalid BAM header")
hseqnew.append(h["SQ"][tid])
hseqnew[-1]["SN"] = h["SQ"][tid]["SN"][:-2]
else:
if h["SQ"][tid]["SN"][-2:] != "_C": sys.exit("methylCtools bconv: error: invalid BAM header")
h["SQ"] = hseqnew
if args.outBAM == "-":
samfileOUT = pysam.Samfile(args.outBAM, "wbu", header=h)
else:
samfileOUT = pysam.Samfile(args.outBAM, "wb", header=h)
if args.outMETRICS:
metricsOUT = open(args.outMETRICS, "w")
except IOError as strerror:
sys.exit("methylCtools bconv: error: %s" % strerror)
if args.qf: sys.stderr.write("%s command: %s\n" % (nicetime(), " ".join(sys.argv)))
#######################################
# define functions
def setQCfail(read):
if not read.is_qcfail:
read.flag += 512
c[6] += 1
return read
def convert(p, s, r): # p: conversion positions, s: sequence, r: is_reverse
pp = re.split("[C,G]", p)
if r: pp.reverse()
pi = 0
for i in pp[:-1]:
if i == "": i = 0
else: i = int(i)
pi += i+1
if s[pi-1] == "T": s = s[:pi-1] + "C" + s[pi:]
elif s[pi-1] == "A": s = s[:pi-1] + "G" + s[pi:]
else: sys.exit("methylCtools bconv: error: unexpected conversion")
return s
def changetid(t):
if t >= 0:
if t%2 == 0: return t/2
else: return (t-1)/2
else: return t
#######################################
# main
if args.qf: sys.stderr.write("%s start: processing alignments\n" % nicetime())
c = [0] *9 # [reads, paired, mapped, qcfail, proper, singleton, illegal, nCGc, isize<readlen]
isized = [0] *501 # insert size distribution
tidd = [0] *len(hseqnew)
while 1:
try:
read1 = samfileIN.next()
while read1.is_secondary: read1 = samfileIN.next() # do not process secondary alignments (bwa-mem)
except StopIteration: break
if read1.is_paired:
### PE read
try:
read2 = samfileIN.next()
while read2.is_secondary: read2 = samfileIN.next()
except StopIteration: break
# convert
read1Q, read2Q = read1.qname.split("."), read2.qname.split(".")
read1H = read1Q[1]
if len(read2Q) == 3: read2H = read2Q[2]
else: read2H = read2Q[1]
if read1Q[0] != read2Q[0]: sys.exit("methylCtools bconv: error: input read order: %s, %s" % (read1Q[0], read2Q[0]))
read1.qname, read2.qname = read1Q[0], read2Q[0] # restore original ID
read1qual, read2qual = read1.qual, read2.qual # pysam "bug": qual is reset when seq is changed
read1.seq = convert(read1H, read1.seq, read1.is_reverse)
read2.seq = convert(read2H, read2.seq, read2.is_reverse)
read1.qual, read2.qual = read1qual, read2qual
# check for illegal mapping
for read in [read1, read2]:
if not read.is_unmapped:
if read.tid%2 == 0:
if read.is_read1: # 99
if read.is_reverse:
setQCfail(read1), setQCfail(read2)
else: # 147
if not read.is_reverse:
setQCfail(read1), setQCfail(read2)
else:
if read.is_read1: # 83
if not read.is_reverse:
setQCfail(read1), setQCfail(read2)
else:
if read.is_reverse: # 163
setQCfail(read1), setQCfail(read2)
# check for unconverted non-CpGs
if not type(args.maxnCGc) == type(False):
nCGc1, nCGc2 = 0, 0
if not read1.is_unmapped:
if read1.is_read1 != read1.is_reverse: nCGc1 = len([s2 for s2 in re.findall("C[A,C,T]", read1.query)])
else: nCGc1 = len([s2 for s2 in re.findall("[A,G,T]G", read1.query)])
if not read2.is_unmapped:
if read2.is_read1 != read2.is_reverse: nCGc2 = len([s2 for s2 in re.findall("C[A,C,T]", read2.query)])
else: nCGc2 = len([s2 for s2 in re.findall("[A,G,T]G", read2.query)])
if nCGc1 > args.maxnCGc or nCGc2 > args.maxnCGc:
if not read1.is_qcfail: read1.flag += 512
if not read2.is_qcfail: read2.flag += 512
c[7] += 2
# change tid
read1.tid = changetid(read1.tid)
read1.mrnm = changetid(read1.mrnm)
read2.tid = changetid(read2.tid)
read2.mrnm = changetid(read2.mrnm)
# write
samfileOUT.write(read1)
samfileOUT.write(read2)
# metrics
c[0] += 2 # reads
if args.outMETRICS:
c[1] += 2 # paired
for read in [read1, read2]:
if read.is_proper_pair:
c[4] += 1 # proper pair
c[2] += 1
tidd[read.tid] += 1
elif not read.is_unmapped:
c[2] += 1 # mapped
if read.mate_is_unmapped: c[5] += 1 # singleton
elif read.tid == read.mrnm and read.pos == read.mpos: c[8] += 1 # isize < read length
tidd[read.tid] += 1
if not read.isize == 0 and abs(read.isize) <= 500:
isized[abs(read.isize)] += 1 # isize
if read.is_qcfail: c[3] += 1 # qcfail
if args.qf and c[0]%10**6 <= 1: sys.stderr.write("%s status: %i alignments processed\n" % (nicetime(), c[0]))
else:
### SE read
if read1.is_secondary: continue
# convert
read1Q = read1.qname.split(".")
read1H = read1Q[1]
read1.qname = read1Q[0] # restore original ID
read1qual = read1.qual # pysam "bug": qual is reset when seq is changed
read1.seq = convert(read1H, read1.seq, read1.is_reverse)
read1.qual = read1qual
# check for illegal mapping
if not read1.is_unmapped:
if read1.tid%2 == 0:
if read1.is_reverse: # 0
setQCfail(read1)
else:
if not read1.is_reverse: # 16
setQCfail(read1)
# check for unconverted non-CpGs
if not type(args.maxnCGc) == type(False):
nCGc1 = 0
if not read1.is_unmapped:
if not read1.is_reverse: nCGc1 = len([s2 for s2 in re.findall("C[A,C,T]", read1.query)])
else: nCGc1 = len([s2 for s2 in re.findall("[A,G,T]G", read1.query)])
if nCGc1 > args.maxnCGc:
if not read1.is_qcfail: read1.flag += 512
c[7] += 1
# change tid
read1.tid = changetid(read1.tid)
read1.mrnm = changetid(read1.mrnm)
# write
samfileOUT.write(read1)
# metrics
c[0] += 1 # reads
if args.outMETRICS:
if not read1.is_unmapped:
c[2] += 1 # mapped
tidd[read1.tid] += 1 # tid
if read1.is_qcfail: c[3] += 1 # qcfail
if args.qf and c[0]%10**6 == 0: sys.stderr.write("%s status: %i alignments processed\n" % (nicetime(), c[0]))
#######################################
# end
if args.outMETRICS:
metricsOUT.write("# flag statistics\n")
metricsOUT.write("%10i in total\n" % c[0])
metricsOUT.write("%10i paired in sequencing\n\n" % c[1])
metricsOUT.write("%10i mapped (%.2f%%)\n" % (c[2], c[2]/float(c[0])*100))
metricsOUT.write("%10i properly paired (%.2f%%)\n" % (c[4], c[4]/float(c[0])*100))
metricsOUT.write("%10i isize smaller read length (%.2f%%)\n" % (c[8], c[8]/float(c[0])*100))
metricsOUT.write("%10i singletons (%.2f%%)\n\n" % (c[5], c[5]/float(c[0])*100))
if not type(args.maxnCGc) == type(False):
metricsOUT.write("%10i QC-fail (%.2f%%)\n" % (c[3], c[3]/float(c[0])*100))
metricsOUT.write("%10i illegally mapped (%.2f%%)\n" % (c[6], c[6]/float(c[0])*100))
metricsOUT.write("%10i >%i non-CG Cs (%.2f%%)\n" % (c[7], args.maxnCGc, c[7]/float(c[0])*100))
else:
metricsOUT.write("%10i illegally mapped (%.2f%%)\n" % (c[3], c[3]/float(c[0])*100))
metricsOUT.write("\n# mapped reads per contig\n")
for tid in range(len(tidd)):
metricsOUT.write("%10i %s (%.2f%%)\n" % (tidd[tid], samfileOUT.getrname(tid), (tidd[tid]/float(c[2])*100)))
metricsOUT.write("\n# insert size distribution\n")
for isize in range(len(isized)):
metricsOUT.write("%10i %i\n" % (isized[isize], isize))
metricsOUT.close()
samfileIN.close()
samfileOUT.close()
if args.qf: sys.stderr.write("%s end: %i alignments processed\n" % (nicetime(), c[0]))
if __name__ == "__main__":
import sys
mod_bconv(sys.argv[1:])