-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathm5_stats.py
169 lines (138 loc) · 7.29 KB
/
m5_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import sys
import argparse
import fasta
import aln_formats
def main(unc, cor, fa, method, verbose=False):
if verbose:
print "Reading pacbio fasta"
pacbio_reads, names = fasta.read_fasta(fa)
cor_aligned = 0
unc_aligned = 0
tp = 0
fp = 0
fn = 0
#ne = 0
tn = 0
'''
From ec_toolkit compute-stats.py:
errorStats['TP'] += len(errPreCorrect.difference(errPostCorrect))
errorStats['FP'] += len(errPostCorrect.difference(errPreCorrect))
errorStats['FN'] += len(errPreCorrect.intersection(errPostCorrect))
errorStats['NE'] += getNumWrongBase(errPreCorrect,errPostCorrect)
# apparently, NE is the number of bases changed, but still incorrect
'''
'''
From Error Correction Toolkit paper:
We use the following measures for each program:
number of erroneous bases identified and
successfully corrected (true positives, TP), correct
bases wrongly identified as errors and changed
(false positives, FP), and erroneous bases that were
either uncorrected or falsely corrected (false negatives,
FN). We report sensitivity and specificity for
each program. Then, we combine these into the gain
metric [21], defined by gain = (TP - FP) /
(TP + FN), which is the percentage of errors
removed from the data set by the error-correction
program. A negative gain value indicates that more
errors have been introduced due to false corrections,
which is not captured by measures such as sensitivity
and specificity.
'''
cor_iter = aln_formats.iter_m5(cor, 0, 0, 1000000000, by_query=True)
uncor_iter = aln_formats.iter_m5(unc, 0, 0, 1000000000, by_query=True)
cor_query = None
uncor_query = None
correct_uncorrected = 0
incorrect_uncorrected = 0
correct_corrected = 0
incorrect_corrected = 0
# ------ !! keep track of loci on the target sequence since coordinates on the query sequence will change dramatically ------
# ------ !! although if sequences align to different places, everything will go to crap ------
while True:
if cor_query is not None and cor_query == uncor_query and cor_best_aln.target.name == uncor_best_aln.target.name:
if verbose:
print
print "{} aligned for both uncorrected and corrected".format(cor_query)
print "{} errors in uncorrected read".format(len(incorrect_loci_in_uncorrected))
print "{} errors in corrected read".format(len(incorrect_loci_in_corrected))
read_tp = len(prev_incorrect & now_correct)
read_fp = len(prev_correct & now_incorrect)
read_fn = len(prev_incorrect & now_incorrect)
read_tn = len(prev_correct & now_correct)
# not obviously trivial to compute this using set operations
#read_ne = 0
tp += read_tp
fp += read_fp
fn += read_fn
tn += read_tn
if (uncor_query is None or cor_iter is None or uncor_query <= cor_query) and uncor_iter is not None:
try:
uncor_query, uncor_aln = uncor_iter.next()
except:
uncor_iter = None
if cor_iter is None:
break
unc_aligned += 1
uncor_best_aln = sorted(uncor_aln, key = lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1] # keep only "best" alignment, by total correct bp
incorrect_loci_in_uncorrected = []
correct_loci_in_uncorrected = []
# get all incorrect loci in uncorrected alignment
tpos = uncor_best_aln.target.start
for i in xrange(len(uncor_best_aln.alignment)):
if uncor_best_aln.alignment[i] == '|':
correct_loci_in_uncorrected.append(tpos)
else:
if uncor_best_aln.target.alignment[i] == '-':
incorrect_loci_in_uncorrected.append("{}i{}".format(uncor_best_aln.query.alignment[i], tpos)) # <nucleotide> inserted before tpos
else:
incorrect_loci_in_uncorrected.append("x{}".format(tpos)) # <nucleotide> mismatch or deleted at tpos
if uncor_best_aln.target.alignment[i] != '-':
tpos += 1
incorrect_loci_in_uncorrected.extend(range(uncor_best_aln.target.start-uncor_best_aln.query.start, uncor_best_aln.target.start) + range(uncor_best_aln.target.end, uncor_best_aln.target.end+uncor_best_aln.query.length-uncor_best_aln.query.end)) # finagle an estimate of the target regions that are supposed to be covered by the read
prev_incorrect = set(incorrect_loci_in_uncorrected)
prev_correct = set(correct_loci_in_uncorrected)
incorrect_uncorrected += len(prev_incorrect)
correct_uncorrected += len(prev_correct)
else:
try:
cor_query, cor_aln = cor_iter.next()
except:
cor_iter = None
if uncor_iter is None:
break
cor_aligned += 1
cor_best_aln = sorted(cor_aln, key = lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1] # keep only "best" alignment, by total correct bp
incorrect_loci_in_corrected = []
correct_loci_in_corrected = []
# get all incorrect loci in corrected alignment
tpos = cor_best_aln.target.start
for i in xrange(len(cor_best_aln.alignment)):
if cor_best_aln.alignment[i] == '|':
correct_loci_in_corrected.append(tpos)
else:
if cor_best_aln.target.alignment[i] == '-':
incorrect_loci_in_corrected.append("{}i{}".format(cor_best_aln.query.alignment[i], tpos)) # <nucleotide> inserted before tpos
else:
incorrect_loci_in_corrected.append("{}".format(tpos)) # <nucleotide> mismatch or deleted at tpos
if cor_best_aln.target.alignment[i] != '-':
tpos += 1
incorrect_loci_in_corrected.extend(range(cor_best_aln.target.start-cor_best_aln.query.start, cor_best_aln.target.start) + range(cor_best_aln.target.end, cor_best_aln.target.end+cor_best_aln.query.length-cor_best_aln.query.end)) # finagle an estimate of the target regions that are supposed to be covered by the read
now_incorrect = set(incorrect_loci_in_corrected)
now_correct = set(correct_loci_in_corrected) # these should already be unique, but we need them to be sets to do set operations
incorrect_corrected += len(now_incorrect)
correct_corrected += len(now_correct)
if tp + fn == 0:
raise Exception("No read names matched (uncor: {}, cor: {})".format(uncor_query, cor_query))
gain = float(tp - fp) / (tp + fn)
print "Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tUncorrected wrong bp\tUncorrected right bp\tCorrected wrong bp\tCorrected right bp\tTP\tFP\tTN\tFN\tSensitivity\tSpecificity\tGain"
print "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % (fa, cor, unc_aligned, cor_aligned, (cor_aligned-unc_aligned), incorrect_uncorrected, correct_uncorrected, incorrect_corrected, correct_corrected, tp, fp, tn, fn, (float(tp)/(tp + fn)), (float(tn)/(tn + fp)), gain)
if __name__ == "__main__":
parser = argparse.ArgumentParser("Compute true/false positive and gain stats for long-read error corrections")
parser.add_argument("uncorrected", help="m5 file for original reads")
parser.add_argument("corrected", help="m5 file for corrected reads")
parser.add_argument("fasta", help="Uncorrected fasta")
parser.add_argument("method", help="Method used to correct reads")
parser.add_argument("--verbose", help="Verbosity (every read)", default=False, action="store_true")
args = parser.parse_args()
main(args.uncorrected, args.corrected, args.fasta, args.method, args.verbose)