Modify pairtools dedup to select the best MAPQ alignment as output of deduplication #95

Status: Closed (wants to merge 2 commits)
pairtools/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -11,7 +11,7 @@
 
 """
 
-__version__ = '0.3.0'
+__version__ = '0.3.0b0'
 
 
 import click
pairtools/_dedup.pyx (49 changes: 43 additions & 6 deletions)

@@ -149,7 +149,9 @@ cdef class OnlineDuplicateDetector(object):
     cdef cython.int [:] p2
     cdef cython.int [:] s1
     cdef cython.int [:] s2
+    cdef cython.int [:] min_mapq
     cdef cython.char [:] rm
+    cdef cython.char [:] rm_best
     cdef int methodid
     cdef int low
     cdef int high
@@ -168,8 +170,11 @@
         self.p1 = np.zeros(0, np.int32)
         self.p2 = np.zeros(0, np.int32)
         self.s1 = np.zeros(0, np.int32)
         self.s2 = np.zeros(0, np.int32)
+        self.min_mapq = np.zeros(0, np.int32)
         self.rm = np.zeros(0, np.int8)
+        self.rm_best = np.zeros(0, np.int8)
+
         if method == "max":
             self.methodid = 0
         elif method == "sum":
@@ -182,7 +187,8 @@
 
     def _shrink(self):
         if self.returnData == 1:
-            firstret = self.rm[:self.low]
+            # firstret = self.rm[:self.low]
+            firstret = self.rm_best[:self.low]
             retainMask = (np.asarray(firstret) == False)
             del firstret
             ret = []
@@ -194,7 +200,9 @@
         self.p2 = self.p2[self.low:]
         self.s1 = self.s1[self.low:]
         self.s2 = self.s2[self.low:]
-        pastrm = self.rm[:self.low]
+        # pastrm = self.rm[:self.low]
+        pastrm = self.rm_best[:self.low]
+        self.rm_best = self.rm_best[self.low:]
         self.rm = self.rm[self.low:]
         self.high = self.high-self.low
         self.N = self.N - self.low
@@ -206,25 +214,40 @@
     def _run(self, finish=False):
         cdef int finishing = 0
         cdef int extraCondition
+        cdef int best_ind
+        cdef int best_minmapq
 
         if finish:
             finishing = 1
 
+        if self.low < self.N:
+            best_ind = self.low
+            best_minmapq = self.min_mapq[self.low]
+
         while True:
             if self.low == self.N:
                 break
 
             if self.high == self.N:
                 if finishing == 1:
+                    if self.low != best_ind:
+                        self.rm_best[self.low] = 1
+                        self.rm_best[best_ind] = 0
                     self.low += 1
                     self.high = self.low + 1
+                    if self.low < self.N:
+                        best_ind = self.low
+                        best_minmapq = self.min_mapq[self.low]
                     continue
                 else:
                     break
 
             if self.rm[self.low] == 1:
                 self.low += 1
                 self.high = self.low+1
+                if self.low < self.N:
+                    best_ind = self.low
+                    best_minmapq = self.min_mapq[self.low]
                 continue
 
             # if high already removed, just continue
@@ -236,8 +259,15 @@
             if ((self.c1[self.high] != self.c1[self.low]) or
                 (self.p1[self.high] - self.p1[self.low] > self.max_mismatch) or
                 (self.p1[self.high] - self.p1[self.low] < 0 )):
+                if self.low != best_ind:
+                    self.rm_best[self.low] = 1
+                    self.rm_best[best_ind] = 0
+
                 self.low += 1
                 self.high = self.low + 1 # restart high
+                if self.low < self.N:
+                    best_ind = self.low
+                    best_minmapq = self.min_mapq[self.low]
                 continue
 
             if self.methodid == 0:
@@ -259,21 +289,28 @@
                  (self.s1[self.low] == self.s1[self.high]) and
                  (self.s2[self.low] == self.s2[self.high]) and
                  extraCondition):
+                # compare the min_mapq
+                if (self.min_mapq[self.high] > best_minmapq):
+                    best_ind = self.high
+                    best_minmapq = self.min_mapq[self.high]
+                self.rm_best[self.high] = 1
                 self.rm[self.high] = 1
                 self.high += 1
                 continue
             self.high += 1
 
         return self._shrink()
 
-    def push(self, c1, c2, p1, p2, s1, s2):
+    def push(self, c1, c2, p1, p2, s1, s2, min_mapq):
         self.c1 = np.concatenate([self.c1, c1])
         self.c2 = np.concatenate([self.c2, c2])
         self.p1 = np.concatenate([self.p1, p1])
         self.p2 = np.concatenate([self.p2, p2])
         self.s1 = np.concatenate([self.s1, s1])
         self.s2 = np.concatenate([self.s2, s2])
+        self.min_mapq = np.concatenate([self.min_mapq, min_mapq])
         self.rm = np.concatenate([self.rm, np.zeros(len(c1), dtype=np.int8)])
+        self.rm_best = np.concatenate([self.rm_best, np.zeros(len(c1), dtype=np.int8)])
        self.N = self.N + len(c1)
        return self._run(finish=False)
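
The rm_best/min_mapq bookkeeping above boils down to a simple rule: within every cluster of duplicates, keep the pair with the largest min(mapq1, mapq2) and mark all the others; ties go to the earlier pair because the comparison is strict. A minimal pure-Python sketch of that rule, not the PR's Cython code (keep_best_by_mapq and the example values are made up for illustration):

    import numpy as np

    def keep_best_by_mapq(clusters, min_mapq):
        """For each duplicate cluster (a list of row indices), keep only the row
        whose min(mapq1, mapq2) is highest; ties keep the earliest row, matching
        the strict '>' comparison used in _dedup.pyx."""
        rm_best = np.zeros(len(min_mapq), dtype=np.int8)
        for cluster in clusters:
            best = cluster[0]
            for i in cluster:
                rm_best[i] = 1                    # tentatively mark every member
                if min_mapq[i] > min_mapq[best]:  # strictly larger min-MAPQ wins
                    best = i
            rm_best[best] = 0                     # un-mark the retained pair
        return rm_best

    # One cluster of three duplicates with min-MAPQs 3, 60, 60: the pair at index 1
    # is kept (first of the tied best), whereas plain positional dedup keeps index 0.
    print(keep_best_by_mapq([[0, 1, 2]], np.array([3, 60, 60])))  # -> [1 0 1]
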
pairtools/pairtools_dedup.py (45 changes: 38 additions & 7 deletions)

@@ -117,6 +117,17 @@
     type=int,
     default=_pairsam_format.COL_S2,
     help='Strand 2 column; default {}'.format(_pairsam_format.COL_S2))
+# Provide MAPQ values for choosing the best-mapped pair among duplicates to keep. The default is to keep
+# the first pair encountered, which may be a low-MAPQ pair that gets filtered out in downstream analysis,
+# such as pairtools select with a MAPQ threshold. With the MAPQ values available, the high-MAPQ "duplicated"
+# pair can be selected instead, avoiding the problem.
+@click.option(
+    "--mapq-cols",
+    nargs=2,
+    help='MAPQ columns 1 and 2; can be provided either as 0-based column indices or as column '
+         'names (requires the "#columns" header field). Example: \'--mapq-cols "mapq1" "mapq2"\' or \'--mapq-cols 10 11\'. '
+         'The MAPQ values are used to select the best pair to keep when there are duplicates. '
+         'The best pair is the one with the highest value of min(mapq1, mapq2).')
 @click.option(
     "--unmapped-chrom",
     type=str,
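
A worked example of the behaviour the comment and help text above describe (the MAPQ values are hypothetical): if two pairs are duplicates of each other, one with MAPQs (3, 60) and the other with (60, 60), default dedup keeps whichever comes first; if that is the (3, 60) pair, a downstream pairtools select '(mapq1>=30) and (mapq2>=30)' drops it and the contact is lost entirely. With --mapq-cols mapq1 mapq2 the (60, 60) pair is kept, because min(60, 60) = 60 beats min(3, 60) = 3. A small sketch of that selection key (min_mapq_key and the tuples are made up for illustration):

    def min_mapq_key(mapq1, mapq2):
        """Ranking key for duplicates: keep the pair whose worse side is best."""
        return min(int(mapq1), int(mapq2))

    # Two duplicates of the same molecule as (read_id, mapq1, mapq2); values are made up.
    dups = [("dup_a", 3, 60), ("dup_b", 60, 60)]
    kept = max(dups, key=lambda d: min_mapq_key(d[1], d[2]))
    print(kept[0])  # -> dup_b; plain positional dedup would have kept dup_a
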
@@ -145,7 +156,7 @@ def dedup(pairs_path, output, output_dups, output_unmapped,
           output_stats,
           max_mismatch, method,
           sep, comment_char, send_header_to,
-          c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair, **kwargs
+          c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair, **kwargs
     ):
     '''Find and remove PCR/optical duplicates.
 
@@ -160,7 +171,7 @@ def dedup(pairs_path, output, output_dups, output_unmapped,
              output_stats,
              max_mismatch, method,
              sep, comment_char, send_header_to,
-             c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair,
+             c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair,
              **kwargs
         )
 
@@ -170,7 +181,7 @@ def dedup_py(
     output_stats,
     max_mismatch, method,
     sep, comment_char, send_header_to,
-    c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair,
+    c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair,
     **kwargs
     ):
     sep = ast.literal_eval('"""' + sep + '"""')
@@ -229,6 +240,15 @@ def dedup_py(
     outstream_unmapped.writelines((l+'\n' for l in header))
 
     column_names = _headerops.extract_column_names(header)
+
+    # use mapq1 and mapq2 in dedup if provided
+    mapq1 = None
+    mapq2 = None
+    if mapq_cols:
+        mapq1 = int(mapq_cols[0]) if mapq_cols[0].isdigit() else column_names.index(mapq_cols[0])
+        mapq2 = int(mapq_cols[1]) if mapq_cols[1].isdigit() else column_names.index(mapq_cols[1])
+        # print("MAPQ cols:", mapq1, mapq2)
+
     extra_cols1 = []
     extra_cols2 = []
     if extra_col_pair is not None:
@@ -240,7 +260,7 @@ def dedup_py(
 
 
     streaming_dedup( method, max_mismatch, sep,
-        c1, c2, p1, p2, s1, s2, extra_cols1, extra_cols2, unmapped_chrom,
+        c1, c2, p1, p2, s1, s2, mapq1, mapq2, extra_cols1, extra_cols2, unmapped_chrom,
         body_stream, outstream, outstream_dups,
         outstream_unmapped, out_stat, mark_dups)
@@ -278,12 +298,15 @@ def ar(mylist, val):
 
 def streaming_dedup(
         method, max_mismatch, sep,
-        c1ind, c2ind, p1ind, p2ind, s1ind, s2ind, extra_cols1, extra_cols2,
+        c1ind, c2ind, p1ind, p2ind, s1ind, s2ind, mapq1ind, mapq2ind, extra_cols1, extra_cols2,
         unmapped_chrom,
         instream, outstream, outstream_dups, outstream_unmapped,
         out_stat, mark_dups):
 
     maxind = max(c1ind, c2ind, p1ind, p2ind, s1ind, s2ind)
+    if bool(mapq1ind) and bool(mapq2ind):
+        maxind = max(maxind, mapq1ind, mapq2ind)
+
     if bool(extra_cols1) and bool(extra_cols2):
         maxind = max(maxind, max(extra_cols1), max(extra_cols2))
 
@@ -298,7 +321,7 @@
 
     dd = _dedup.OnlineDuplicateDetector(method, max_mismatch, returnData=False)
 
-    c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []
+    c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []; min_mapq = []
     line_buffer = []
     cols_buffer = []
     chromDict = {}
@@ -353,13 +376,21 @@
                 s1.append(fetchadd(cols[s1ind], strandDict))
                 s2.append(fetchadd(cols[s2ind], strandDict))
 
+                # get min_mapq
+                if bool(mapq1ind) and bool(mapq2ind):
+                    min_mapq.append(min(int(cols[mapq1ind]), int(cols[mapq2ind])))
+                    # print(min(int(cols[mapq1ind]), int(cols[mapq2ind])))
+                else:
+                    min_mapq.append(0)
+
         if (not stripline) or (len(c1) == curMaxLen):
             res = dd.push(ar(c1, 32),
                           ar(c2, 32),
                           ar(p1, 32),
                           ar(p2, 32),
                           ar(s1, 32),
-                          ar(s2, 32))
+                          ar(s2, 32),
+                          ar(min_mapq, 32))
             if not stripline:
                 res = np.concatenate([res, dd.finish()])
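
Taken together, the dedup_py and streaming_dedup changes feed one extra number per pair into the duplicate detector: the column values given to --mapq-cols are resolved to indices, and every row contributes min(mapq1, mapq2) as its ranking key. A sketch of that path under the assumptions visible in the diff (digit strings are treated as 0-based indices, names are looked up in the "#columns" header, and the key falls back to 0 when the option is absent); resolve_mapq_cols, row_min_mapq and the file names in the example command are made up for illustration:

    def resolve_mapq_cols(mapq_cols, column_names):
        """Turn the two --mapq-cols values into 0-based column indices: a digit
        string is used as an index, anything else is looked up among the
        '#columns' header names."""
        if not mapq_cols:
            return None, None
        mapq1 = int(mapq_cols[0]) if mapq_cols[0].isdigit() else column_names.index(mapq_cols[0])
        mapq2 = int(mapq_cols[1]) if mapq_cols[1].isdigit() else column_names.index(mapq_cols[1])
        return mapq1, mapq2

    def row_min_mapq(cols, mapq1ind, mapq2ind):
        """Per-row dedup key: min of the two MAPQ fields, or 0 when --mapq-cols
        was not given."""
        if mapq1ind is None or mapq2ind is None:
            return 0
        return min(int(cols[mapq1ind]), int(cols[mapq2ind]))

    # Hypothetical invocation with named MAPQ columns:
    #   pairtools dedup --mapq-cols mapq1 mapq2 --output deduped.pairs input.pairs

Falling back to 0 when the option is absent preserves the old behaviour: all pairs in a cluster then tie on the key, so the first pair is kept exactly as before.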