From a2ef5aae5b6f8e3e6ba1d33447e7a5c9561b1294 Mon Sep 17 00:00:00 2001
From: Tom Smith <thomas.smith2@dpag.ox.ac.uk>
Date: Wed, 25 May 2016 13:48:07 +0100
Subject: [PATCH 1/3] adds quality filtering to extract

---
 umi_tools/extract.py | 97 ++++++++++++++++++++++++++++++++++++++++----
 umi_tools/version.py |  2 +-
 2 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/umi_tools/extract.py b/umi_tools/extract.py
index 816a013e..eb84c9fb 100644
--- a/umi_tools/extract.py
+++ b/umi_tools/extract.py
@@ -100,6 +100,13 @@
 # https://github.com/CGATOxford/cgat/blob/master/CGAT/Fastq.py
 ###############################################################################
 
+RANGES = {
+    'phred33': (33, 77),
+    'solexa': (59, 106),
+    'phred64': (64, 106),
+}
+
+
 class Record:
     """A record representing a :term:`fastq` formatted record.
 
@@ -113,7 +120,7 @@ class Record:
        String representation of quality scores.
     format : string
        Quality score format. Can be one of ``sanger``,
-       ``illumina-1.8``, ``solexa`` or ``phred64``.
+       ``illumina-1.9``, ``solexa`` or ``phred64``.
 
     """
     def __init__(self, identifier, seq, quals, format=None):
@@ -121,6 +128,19 @@ def __init__(self, identifier, seq, quals, format=None):
             identifier, seq, quals, format)
         self.format = None
 
+    def guessFormat(self):
+        '''return quality score format -
+        might return several if ambiguous.'''
+
+        c = [ord(x) for x in self.quals]
+        mi, ma = min(c), max(c)
+        r = []
+        for format, v in RANGES.iteritems():
+            m1, m2 = v
+            if mi >= m1 and ma < m2:
+                r.append(format)
+        return r
+
     def __str__(self):
         return "@%s\n%s\n+\n%s" % (self.identifier, self.seq, self.quals)
 
@@ -189,7 +209,10 @@ def _joiner_5prime(self, sequence, sample):
     def _joiner_3prime(self, sequence, sample):
         return sequence + sample
 
-    def __init__(self, pattern, pattern2, prime3=False):
+    def __init__(self, pattern, pattern2,
+                 quality_filter_threshold,
+                 quality_encoding,
+                 prime3=False,):
 
         if prime3:
             self.extract = self._extract_3prime
@@ -198,6 +221,8 @@ def __init__(self, pattern, pattern2, prime3=False):
             self.extract = self._extract_5prime
             self.joiner = self._joiner_5prime
 
+        self.thresh = quality_filter_threshold
+        self.encoding = quality_encoding
         self.pattern_length = len(pattern)
         self.umi_bases = [x for x in range(len(pattern)) if pattern[x] is "N"]
         self.bc_bases = [x for x in range(len(pattern)) if pattern[x] is "X"]
@@ -216,6 +241,24 @@ def __call__(self, read1, read2=None):
         bc1, sequence1 = self.extract(read1.seq)
         bc_qual1, seq_qual1 = self.extract(read1.quals)
 
+        # check qualities of UMI bases from read1
+        if self.thresh:
+            read1_format = read1.guessFormat()
+            assert self.encoding in read1_format, (
+                "specified quality encoding (%s) does not match possible "
+                "format for read (%s)" % (self.encoding, read1_format))
+
+            umi_qual = map(ord, bc_qual1)
+            umi_qual = [x - RANGES[self.encoding][0] for x in umi_qual]
+            below_threshold = len([x for x in umi_qual if x < self.thresh])
+
+            # if any bases below threshold, return None
+            if below_threshold > 0:
+                if read2:
+                    return None, None
+                else:
+                    return None
+
         umi1 = "".join([bc1[x] for x in self.umi_bases])
         sample1 = "".join([bc1[x] for x in self.bc_bases])
         sample_qual1 = "".join([bc_qual1[x] for x in self.bc_bases])
@@ -224,6 +267,21 @@ def __call__(self, read1, read2=None):
             bc2, sequence2 = self.extract(read2.seq, read=2)
             bc_qual2, seq_qual2 = self.extract(read2.quals, read=2)
 
+            # check qualities of UMI bases from read2
+            if self.thresh:
+                read2_format = read2.guessFormat()
+                assert self.encoding in read2_format, (
+                    "specified quality encoding (%s) does not match possible "
+                    "format(s) for read 2 (%s)" % (self.encoding, read2_format))
+
+                umi_qual2 = map(ord, bc_qual2)
+                umi_qual2 = [x - RANGES[self.encoding][0] for x in umi_qual2]
+                below_threshold = len([x for x in umi_qual2 if x < self.thresh])
+
+                # if any bases below threshold, return None, None
+                if below_threshold > 0:
+                    return None, None
+
             umi2 = "".join([bc2[x] for x in self.umi_bases2])
             sample2 = "".join([bc2[x] for x in self.bc_bases2])
             sample_qual2 = "".join([bc_qual2[x] for x in self.bc_bases2])
@@ -280,6 +338,15 @@ def main(argv=None):
                       help="barcode is on 3' end of read")
     parser.add_option("--read2-out", dest="read2_out", type="string",
                       help="file to output processed paired read to")
+    parser.add_option("--quality-filter-threshold",
+                      dest="quality_filter_threshold", type="int",
+                      help=("Remove reads where any UMI base quality score "
+                            "falls below this threshold"))
+    parser.add_option("--quality-encoding",
+                      dest="quality_encoding", type="choice",
+                      choices=["phred33", "phred64", "solexa"],
+                      help=("Quality score encoding. Choose from phred33"
+                            "[33-77] phred64 [64-106] or solexa [59-106]"))
     parser.add_option("--supress-stats", dest="stats", action="store_false",
                       help="Suppress the writing of stats to the log")
 
@@ -289,7 +356,9 @@ def main(argv=None):
                         read2_in=None,
                         read2_out=None,
                         prime3=False,
-                        stats=True)
+                        stats=True,
+                        quality_filter_threshold=None,
+                        quality_encoding=None)
 
     # add common options (-h/--help, ...) and parse command line
 
@@ -311,14 +380,26 @@ def main(argv=None):
             raise ValueError("must specify an output for the paired end "
                              "``--read2-out``")
 
+    if options.quality_filter_threshold:
+        if not options.quality_encoding:
+            raise ValueError("must provide a quality encoding to filter UMIs "
+                             "by quality ``--quality-encoding``")
+
     # Initialise the processor
-    processor = Extractor(options.pattern, options.pattern2, options.prime3)
+    processor = Extractor(options.pattern,
+                          options.pattern2,
+                          options.quality_filter_threshold,
+                          options.quality_encoding,
+                          options.prime3)
+
     read1s = fastqIterate(options.stdin)
 
     if options.read2_in is None:
 
         for read in read1s:
-            options.stdout.write(str(processor(read)) + "\n")
+            new_1 = processor(read)
+            if new_1:
+                options.stdout.write(str(new_1) + "\n")
 
     else:
 
@@ -326,9 +407,11 @@ def main(argv=None):
         read2_out = U.openFile(options.read2_out, "w")
 
         for read1, read2 in izip(read1s, read2s):
+            U.info("read1: %s, read2: %s" % ( read1, read2))
             new_1, new_2 = processor(read1, read2)
-            options.stdout.write(str(new_1) + "\n")
-            read2_out.write(str(new_2) + "\n")
+            if new_1:
+                options.stdout.write(str(new_1) + "\n")
+                read2_out.write(str(new_2) + "\n")
 
     # write footer and output benchmark information.
 
diff --git a/umi_tools/version.py b/umi_tools/version.py
index 869ac9c0..36e61c64 100644
--- a/umi_tools/version.py
+++ b/umi_tools/version.py
@@ -1,2 +1,2 @@
 
-__version__ = "0.0.11"
+__version__ = "0.1.0"

From dc1e9fdad5ae0598f7a9a9a2c680eb8b88abb305 Mon Sep 17 00:00:00 2001
From: Tom Smith <thomas.smith2@dpag.ox.ac.uk>
Date: Wed, 25 May 2016 13:51:58 +0100
Subject: [PATCH 2/3] updates documentation

---
 umi_tools/dedup.py   | 2 +-
 umi_tools/extract.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/umi_tools/dedup.py b/umi_tools/dedup.py
index 5321076a..27ce2339 100644
--- a/umi_tools/dedup.py
+++ b/umi_tools/dedup.py
@@ -68,7 +68,7 @@
 
       "directional-adjacency"
           Identify clusters of connected UMIs (based on hamming distance
-          threshold) and umi A counts > (2* umi B counts) - 1. Return the
+          threshold) and umi A counts >= (2* umi B counts) - 1. Return the
           UMIs with the highest counts per cluster
 
 --edit-distance-threshold (int)
diff --git a/umi_tools/extract.py b/umi_tools/extract.py
index eb84c9fb..6617b887 100644
--- a/umi_tools/extract.py
+++ b/umi_tools/extract.py
@@ -120,7 +120,7 @@ class Record:
        String representation of quality scores.
     format : string
        Quality score format. Can be one of ``sanger``,
-       ``illumina-1.9``, ``solexa`` or ``phred64``.
+       ``phred33``, ``phred64`` or ``solexa``.
 
     """
     def __init__(self, identifier, seq, quals, format=None):

From 6bc0c7f79a224daec25e214df3efd1a448e30aa3 Mon Sep 17 00:00:00 2001
From: Tom Smith <thomas.smith2@dpag.ox.ac.uk>
Date: Tue, 31 May 2016 12:11:24 +0100
Subject: [PATCH 3/3] corrects quality checking so only UMI bases in bc-pattern
 are checked

---
 umi_tools/extract.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/umi_tools/extract.py b/umi_tools/extract.py
index 6617b887..2986f89e 100644
--- a/umi_tools/extract.py
+++ b/umi_tools/extract.py
@@ -246,9 +246,10 @@ def __call__(self, read1, read2=None):
             read1_format = read1.guessFormat()
             assert self.encoding in read1_format, (
                 "specified quality encoding (%s) does not match possible "
-                "format for read (%s)" % (self.encoding, read1_format))
+                "format for read: \n\n%s\n\npossible format(s)=%s)" % (
+                    self.encoding, read1, read1_format))
 
-            umi_qual = map(ord, bc_qual1)
+            umi_qual = map(ord, [bc_qual1[x] for x in self.umi_bases])
             umi_qual = [x - RANGES[self.encoding][0] for x in umi_qual]
             below_threshold = len([x for x in umi_qual if x < self.thresh])
 
@@ -274,7 +275,7 @@ def __call__(self, read1, read2=None):
                     "specified quality encoding (%s) does not match possible "
                     "format(s) for read 2 (%s)" % (self.encoding, read2_format))
 
-                umi_qual2 = map(ord, bc_qual2)
+                umi_qual2 = map(ord, [bc_qual2[x] for x in self.umi_bases2])
                 umi_qual2 = [x - RANGES[self.encoding][0] for x in umi_qual2]
                 below_threshold = len([x for x in umi_qual2 if x < self.thresh])