From a2ef5aae5b6f8e3e6ba1d33447e7a5c9561b1294 Mon Sep 17 00:00:00 2001 From: Tom Smith Date: Wed, 25 May 2016 13:48:07 +0100 Subject: [PATCH 1/3] adds quality filtering to extract --- umi_tools/extract.py | 97 ++++++++++++++++++++++++++++++++++++++++---- umi_tools/version.py | 2 +- 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/umi_tools/extract.py b/umi_tools/extract.py index 816a013e..eb84c9fb 100644 --- a/umi_tools/extract.py +++ b/umi_tools/extract.py @@ -100,6 +100,13 @@ # https://github.com/CGATOxford/cgat/blob/master/CGAT/Fastq.py ############################################################################### +RANGES = { + 'phred33': (33, 77), + 'solexa': (59, 106), + 'phred64': (64, 106), +} + + class Record: """A record representing a :term:`fastq` formatted record. @@ -113,7 +120,7 @@ class Record: String representation of quality scores. format : string Quality score format. Can be one of ``sanger``, - ``illumina-1.8``, ``solexa`` or ``phred64``. + ``illumina-1.9``, ``solexa`` or ``phred64``. """ def __init__(self, identifier, seq, quals, format=None): @@ -121,6 +128,19 @@ def __init__(self, identifier, seq, quals, format=None): identifier, seq, quals, format) self.format = None + def guessFormat(self): + '''return quality score format - + might return several if ambiguous.''' + + c = [ord(x) for x in self.quals] + mi, ma = min(c), max(c) + r = [] + for format, v in RANGES.iteritems(): + m1, m2 = v + if mi >= m1 and ma < m2: + r.append(format) + return r + def __str__(self): return "@%s\n%s\n+\n%s" % (self.identifier, self.seq, self.quals) @@ -189,7 +209,10 @@ def _joiner_5prime(self, sequence, sample): def _joiner_3prime(self, sequence, sample): return sequence + sample - def __init__(self, pattern, pattern2, prime3=False): + def __init__(self, pattern, pattern2, + quality_filter_threshold, + quality_encoding, + prime3=False,): if prime3: self.extract = self._extract_3prime @@ -198,6 +221,8 @@ def __init__(self, pattern, pattern2, prime3=False): self.extract = self._extract_5prime self.joiner = self._joiner_5prime + self.thresh = quality_filter_threshold + self.encoding = quality_encoding self.pattern_length = len(pattern) self.umi_bases = [x for x in range(len(pattern)) if pattern[x] is "N"] self.bc_bases = [x for x in range(len(pattern)) if pattern[x] is "X"] @@ -216,6 +241,24 @@ def __call__(self, read1, read2=None): bc1, sequence1 = self.extract(read1.seq) bc_qual1, seq_qual1 = self.extract(read1.quals) + # check qualities of UMI bases from read1 + if self.thresh: + read1_format = read1.guessFormat() + assert self.encoding in read1_format, ( + "specified quality encoding (%s) does not match possible " + "format for read (%s)" % (self.encoding, read1_format)) + + umi_qual = map(ord, bc_qual1) + umi_qual = [x - RANGES[self.encoding][0] for x in umi_qual] + below_threshold = len([x for x in umi_qual if x < self.thresh]) + + # if any bases below threshold, return None + if below_threshold > 0: + if read2: + return None, None + else: + return None + umi1 = "".join([bc1[x] for x in self.umi_bases]) sample1 = "".join([bc1[x] for x in self.bc_bases]) sample_qual1 = "".join([bc_qual1[x] for x in self.bc_bases]) @@ -224,6 +267,21 @@ def __call__(self, read1, read2=None): bc2, sequence2 = self.extract(read2.seq, read=2) bc_qual2, seq_qual2 = self.extract(read2.quals, read=2) + # check qualities of UMI bases from read2 + if self.thresh: + read2_format = read2.guessFormat() + assert self.encoding in read2_format, ( + "specified quality encoding (%s) does not match possible " + "format(s) for read 2 (%s)" % (self.encoding, read2_format)) + + umi_qual2 = map(ord, bc_qual2) + umi_qual2 = [x - RANGES[self.encoding][0] for x in umi_qual2] + below_threshold = len([x for x in umi_qual2 if x < self.thresh]) + + # if any bases below threshold, return None, None + if below_threshold > 0: + return None, None + umi2 = "".join([bc2[x] for x in self.umi_bases2]) sample2 = "".join([bc2[x] for x in self.bc_bases2]) sample_qual2 = "".join([bc_qual2[x] for x in self.bc_bases2]) @@ -280,6 +338,15 @@ def main(argv=None): help="barcode is on 3' end of read") parser.add_option("--read2-out", dest="read2_out", type="string", help="file to output processed paired read to") + parser.add_option("--quality-filter-threshold", + dest="quality_filter_threshold", type="int", + help=("Remove reads where any UMI base quality score " + "falls below this threshold")) + parser.add_option("--quality-encoding", + dest="quality_encoding", type="choice", + choices=["phred33", "phred64", "solexa"], + help=("Quality score encoding. Choose from phred33" + "[33-77] phred64 [64-106] or solexa [59-106]")) parser.add_option("--supress-stats", dest="stats", action="store_false", help="Suppress the writing of stats to the log") @@ -289,7 +356,9 @@ def main(argv=None): read2_in=None, read2_out=None, prime3=False, - stats=True) + stats=True, + quality_filter_threshold=None, + quality_encoding=None) # add common options (-h/--help, ...) and parse command line @@ -311,14 +380,26 @@ def main(argv=None): raise ValueError("must specify an output for the paired end " "``--read2-out``") + if options.quality_filter_threshold: + if not options.quality_encoding: + raise ValueError("must provide a quality encoding to filter UMIs " + "by quality ``--quality-encoding``") + # Initialise the processor - processor = Extractor(options.pattern, options.pattern2, options.prime3) + processor = Extractor(options.pattern, + options.pattern2, + options.quality_filter_threshold, + options.quality_encoding, + options.prime3) + read1s = fastqIterate(options.stdin) if options.read2_in is None: for read in read1s: - options.stdout.write(str(processor(read)) + "\n") + new_1 = processor(read) + if new_1: + options.stdout.write(str(new_1) + "\n") else: @@ -326,9 +407,11 @@ def main(argv=None): read2_out = U.openFile(options.read2_out, "w") for read1, read2 in izip(read1s, read2s): + U.info("read1: %s, read2: %s" % ( read1, read2)) new_1, new_2 = processor(read1, read2) - options.stdout.write(str(new_1) + "\n") - read2_out.write(str(new_2) + "\n") + if new_1: + options.stdout.write(str(new_1) + "\n") + read2_out.write(str(new_2) + "\n") # write footer and output benchmark information. diff --git a/umi_tools/version.py b/umi_tools/version.py index 869ac9c0..36e61c64 100644 --- a/umi_tools/version.py +++ b/umi_tools/version.py @@ -1,2 +1,2 @@ -__version__ = "0.0.11" +__version__ = "0.1.0" From dc1e9fdad5ae0598f7a9a9a2c680eb8b88abb305 Mon Sep 17 00:00:00 2001 From: Tom Smith Date: Wed, 25 May 2016 13:51:58 +0100 Subject: [PATCH 2/3] updates documentation --- umi_tools/dedup.py | 2 +- umi_tools/extract.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/umi_tools/dedup.py b/umi_tools/dedup.py index 5321076a..27ce2339 100644 --- a/umi_tools/dedup.py +++ b/umi_tools/dedup.py @@ -68,7 +68,7 @@ "directional-adjacency" Identify clusters of connected UMIs (based on hamming distance - threshold) and umi A counts > (2* umi B counts) - 1. Return the + threshold) and umi A counts >= (2* umi B counts) - 1. Return the UMIs with the highest counts per cluster --edit-distance-threshold (int) diff --git a/umi_tools/extract.py b/umi_tools/extract.py index eb84c9fb..6617b887 100644 --- a/umi_tools/extract.py +++ b/umi_tools/extract.py @@ -120,7 +120,7 @@ class Record: String representation of quality scores. format : string Quality score format. Can be one of ``sanger``, - ``illumina-1.9``, ``solexa`` or ``phred64``. + ``phred33``, ``phred64`` or ``solexa``. """ def __init__(self, identifier, seq, quals, format=None): From 6bc0c7f79a224daec25e214df3efd1a448e30aa3 Mon Sep 17 00:00:00 2001 From: Tom Smith Date: Tue, 31 May 2016 12:11:24 +0100 Subject: [PATCH 3/3] corrects quality checking so only UMI bases in bc-pattern are checked --- umi_tools/extract.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/umi_tools/extract.py b/umi_tools/extract.py index 6617b887..2986f89e 100644 --- a/umi_tools/extract.py +++ b/umi_tools/extract.py @@ -246,9 +246,10 @@ def __call__(self, read1, read2=None): read1_format = read1.guessFormat() assert self.encoding in read1_format, ( "specified quality encoding (%s) does not match possible " - "format for read (%s)" % (self.encoding, read1_format)) + "format for read: \n\n%s\n\npossible format(s)=%s)" % ( + self.encoding, read1, read1_format)) - umi_qual = map(ord, bc_qual1) + umi_qual = map(ord, [bc_qual1[x] for x in self.umi_bases]) umi_qual = [x - RANGES[self.encoding][0] for x in umi_qual] below_threshold = len([x for x in umi_qual if x < self.thresh]) @@ -274,7 +275,7 @@ def __call__(self, read1, read2=None): "specified quality encoding (%s) does not match possible " "format(s) for read 2 (%s)" % (self.encoding, read2_format)) - umi_qual2 = map(ord, bc_qual2) + umi_qual2 = map(ord, [bc_qual2[x] for x in self.umi_bases2]) umi_qual2 = [x - RANGES[self.encoding][0] for x in umi_qual2] below_threshold = len([x for x in umi_qual2 if x < self.thresh])