-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfilter_miscalled_Cs.py
executable file
·83 lines (65 loc) · 2.77 KB
/
filter_miscalled_Cs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
"""
> filter_miscalled_Cs.py <
Script to process the .cov files produced by bismark_methylation_extractor, and
removes methylated positions that potentially arise by chance.
A super-conservative estimate of 1% miscall rate is assumed (might be as low
as 0.1%, evidenced by the CHG and CHH call rate in S. pistillata libraries).
This miscall rate can be tweaked, by changing --miscall.
The script ONLY considers positions that have at least one methylated position.
There's no way to "miscall" a C as methylated... if it's not methylated. As
a result, the B-H correction only takes into consideration the universe of
positions with at least one methylated read.
A binomial test is run to check the cumulative P value (P >= k), where k is the
number of methylated sites, and P values are further B-H corrected.
"""
import argparse
import csv
import sys
import re
import scipy.stats
import correct_p_values
import natural_sort
parser = argparse.ArgumentParser(description="""
Script to process the .cov files produced by bismark_methylation_extractor, and
removes methylated positions that potentially arise by chance.""")
parser.add_argument('bismark_cov', metavar='cov_filename',
type=argparse.FileType('r'),
help='Bismark .cov filename.')
parser.add_argument('--miscall', metavar='miscall_rate',
type=float, default=0.01,
help='Control assumed miscall rate (default: 0.01).')
parser.add_argument('-v', '--verbose', action='store_true',
help='verbose mode, prints progress to stderr.')
args = parser.parse_args()
# calculate binomial p values first
counter_rows = 0
p_values = {}
tsv_reader = csv.reader(args.bismark_cov, delimiter='\t')
for row in tsv_reader:
if not row: continue
if args.verbose:
counter_rows += 1
if counter_rows % 100000 == 0:
print ('{} rows processed...'.format(counter_rows), file=sys.stderr)
# column 4: sum(methylated reads)
# column 5: sum(non-methylated reads)
meth = int(row[4])
non_meth = int(row[5])
# ignore lines with meth == 0
if meth > 0:
# scipy.stats.binom.sf calculates P(X > n) by default; the "-1"
# is added to achieve P(X >= n).
binom_p = scipy.stats.binom.sf(meth-1, meth + non_meth, args.miscall)
p_values[str(row)] = binom_p
# run B-H correction
if args.verbose:
print ('correcting p values...', file=sys.stderr)
p_values = correct_p_values.correct_p_values(p_values)
# then print valid rows out
if args.verbose:
print ('printing output and exiting.', file=sys.stderr)
for p in natural_sort.natural_sort(p_values):
# print out rows that have corrected P values < 0.05
if p_values[p] < 0.05:
print ('\t'.join(eval(p)))