From bf74f0194501f528c0354a98092f107eec171498 Mon Sep 17 00:00:00 2001 From: frankyan Date: Mon, 20 Jul 2020 01:31:32 -0700 Subject: [PATCH 1/2] dev to support mapq based duplicate selction --- pairtools/__init__.py | 2 +- pairtools/_dedup.pyx | 32 ++++++++++++++++++++++---- pairtools/pairtools_dedup.py | 44 ++++++++++++++++++++++++++++++------ 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/pairtools/__init__.py b/pairtools/__init__.py index 39902216..280afed8 100644 --- a/pairtools/__init__.py +++ b/pairtools/__init__.py @@ -11,7 +11,7 @@ """ -__version__ = '0.3.0' +__version__ = '0.3.0b0' import click diff --git a/pairtools/_dedup.pyx b/pairtools/_dedup.pyx index 2f477688..8062e4bb 100644 --- a/pairtools/_dedup.pyx +++ b/pairtools/_dedup.pyx @@ -149,6 +149,7 @@ cdef class OnlineDuplicateDetector(object): cdef cython.int [:] p2 cdef cython.int [:] s1 cdef cython.int [:] s2 + cdef cython.int [:] min_mapq cdef cython.char [:] rm cdef int methodid cdef int low @@ -168,7 +169,8 @@ cdef class OnlineDuplicateDetector(object): self.p1 = np.zeros(0, np.int32) self.p2 = np.zeros(0, np.int32) self.s1 = np.zeros(0, np.int32) - self.s2 = np.zeros(0, np.int32) + self.s2 = np.zeros(0, np.int32) + self.min_mapq = np.zeros(0, np.int32) self.rm = np.zeros(0, np.int8) if method == "max": self.methodid = 0 @@ -206,10 +208,16 @@ cdef class OnlineDuplicateDetector(object): def _run(self, finish=False): cdef int finishing = 0 cdef int extraCondition + cdef int best_ind + cdef int best_minmapq if finish: finishing = 1 - + + if self.low < self.N: + best_ind = self.low + best_minmapq = self.min_mapq[self.low] + while True: if self.low == self.N: break @@ -218,6 +226,9 @@ cdef class OnlineDuplicateDetector(object): if finishing == 1: self.low += 1 self.high = self.low + 1 + if self.low < self.N: + best_ind = self.low + best_minmapq = self.min_mapq[self.low] continue else: break @@ -225,6 +236,9 @@ cdef class OnlineDuplicateDetector(object): if self.rm[self.low] == 1: self.low += 1 self.high = self.low+1 + if self.low < self.N: + best_ind = self.low + best_minmapq = self.min_mapq[self.low] continue # if high already removed, just continue @@ -238,6 +252,9 @@ cdef class OnlineDuplicateDetector(object): (self.p1[self.high] - self.p1[self.low] < 0 )): self.low += 1 self.high = self.low + 1 # restart high + if self.low < self.N: + best_ind = self.low + best_minmapq = self.min_mapq[self.low] continue if self.methodid == 0: @@ -259,20 +276,27 @@ cdef class OnlineDuplicateDetector(object): (self.s1[self.low] == self.s1[self.high]) and (self.s2[self.low] == self.s2[self.high]) and extraCondition): - self.rm[self.high] = 1 + # compare the min_mapq + if(self.min_mapq[self.high] > best_minmapq): + self.rm[best_ind] = 1 + best_ind = self.high + best_minmapq = self.min_mapq[self.high] + else: + self.rm[self.high] = 1 self.high += 1 continue self.high += 1 return self._shrink() - def push(self, c1, c2, p1, p2, s1, s2): + def push(self, c1, c2, p1, p2, s1, s2, min_mapq): self.c1 = np.concatenate([self.c1, c1]) self.c2 = np.concatenate([self.c2, c2]) self.p1 = np.concatenate([self.p1, p1]) self.p2 = np.concatenate([self.p2, p2]) self.s1 = np.concatenate([self.s1, s1]) self.s2 = np.concatenate([self.s2, s2]) + self.min_mapq = np.concatenate([self.min_mapq, min_mapq]) self.rm = np.concatenate([self.rm, np.zeros(len(c1), dtype=np.int8)]) self.N = self.N + len(c1) return self._run(finish=False) diff --git a/pairtools/pairtools_dedup.py b/pairtools/pairtools_dedup.py index ed248943..cabe5202 100644 --- a/pairtools/pairtools_dedup.py +++ b/pairtools/pairtools_dedup.py @@ -117,6 +117,17 @@ type=int, default=_pairsam_format.COL_S2, help='Strand 2 column; default {}'.format(_pairsam_format.COL_S2)) +# Provide MAPQ value for choosing the best mapped pair of duplicates to keep. Default is choosing the first one. +# The first one might be a low MAPQ pair, which might be filtered out in downstream analysis, such as +# pairtools select with MAPQ threshold. With the MAPQ value, the high MAPQ "duplicated" pair can be selected +# to avoide the problem. +@click.option( + "--mapq-cols", + nargs=2, + help='MAPQ columns 1 and 2; Can be either provided as 0-based column indices or as column ' + 'name (requires the "#columns" header field). Example: \'--mapq "mapq1" "mapq2"\' or \'--mapq 10 11\' ' + 'The MAPQ values will be used to select the best pair which will be kept when there are duplicats. ' + 'The best pair is the one has the highest value of min(mapq1, mapq2).') @click.option( "--unmapped-chrom", type=str, @@ -145,7 +156,7 @@ def dedup(pairs_path, output, output_dups, output_unmapped, output_stats, max_mismatch, method, sep, comment_char, send_header_to, - c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair, **kwargs + c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair, **kwargs ): '''Find and remove PCR/optical duplicates. @@ -160,7 +171,7 @@ def dedup(pairs_path, output, output_dups, output_unmapped, output_stats, max_mismatch, method, sep, comment_char, send_header_to, - c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair, + c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair, **kwargs ) @@ -170,7 +181,7 @@ def dedup_py( output_stats, max_mismatch, method, sep, comment_char, send_header_to, - c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair, + c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair, **kwargs ): sep = ast.literal_eval('"""' + sep + '"""') @@ -229,6 +240,15 @@ def dedup_py( outstream_unmapped.writelines((l+'\n' for l in header)) column_names = _headerops.extract_column_names(header) + + # consider mapq1 and mapq2 in dedup if exist + mapq1 = None + mapq2 = None + if len(mapq_cols): + mapq1 = int(mapq_cols[0]) if mapq_cols[0].isdigit() else column_names.index(mapq_cols[0]) + mapq2 = int(mapq_cols[1]) if mapq_cols[1].isdigit() else column_names.index(mapq_cols[1]) + # print("MAPQ cols:", mapq1, mapq2) + extra_cols1 = [] extra_cols2 = [] if extra_col_pair is not None: @@ -240,7 +260,7 @@ def dedup_py( streaming_dedup( method, max_mismatch, sep, - c1, c2, p1, p2, s1, s2, extra_cols1, extra_cols2, unmapped_chrom, + c1, c2, p1, p2, s1, s2, mapq1, mapq2, extra_cols1, extra_cols2, unmapped_chrom, body_stream, outstream, outstream_dups, outstream_unmapped, out_stat, mark_dups) @@ -278,12 +298,15 @@ def ar(mylist, val): def streaming_dedup( method, max_mismatch, sep, - c1ind, c2ind, p1ind, p2ind, s1ind, s2ind, extra_cols1, extra_cols2, + c1ind, c2ind, p1ind, p2ind, s1ind, s2ind, mapq1ind, mapq2ind, extra_cols1, extra_cols2, unmapped_chrom, instream, outstream, outstream_dups, outstream_unmapped, out_stat, mark_dups): maxind = max(c1ind, c2ind, p1ind, p2ind, s1ind, s2ind) + if bool(mapq1ind) and bool(mapq2ind): + maxind = max(maxind, mapq1ind, mapq2ind) + if bool(extra_cols1) and bool(extra_cols2): maxind = max(maxind, max(extra_cols1), max(extra_cols2)) @@ -298,7 +321,7 @@ def streaming_dedup( dd = _dedup.OnlineDuplicateDetector(method, max_mismatch, returnData=False) - c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = [] + c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []; min_mapq = [] line_buffer = [] cols_buffer = [] chromDict = {} @@ -353,13 +376,20 @@ def streaming_dedup( s1.append(fetchadd(cols[s1ind], strandDict)) s2.append(fetchadd(cols[s2ind], strandDict)) + # get min_mapq + if bool(mapq1ind) and bool(mapq2ind): + min_mapq.append(min(int(cols[mapq1ind]), int(cols[mapq2ind]))) + else: + min_mapq.append(0) + if (not stripline) or (len(c1) == curMaxLen): res = dd.push(ar(c1, 32), ar(c2, 32), ar(p1, 32), ar(p2, 32), ar(s1, 32), - ar(s2, 32)) + ar(s2, 32), + ar(min_mapq, 32)) if not stripline: res = np.concatenate([res, dd.finish()]) From 301ca8aa5e30eaa33113757c41742361d8f477aa Mon Sep 17 00:00:00 2001 From: frankyan Date: Mon, 20 Jul 2020 15:17:00 -0700 Subject: [PATCH 2/2] dev 0.3.0b0 select best mapq in dedup --- pairtools/_dedup.pyx | 25 +++++++++++++++++++------ pairtools/pairtools_dedup.py | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pairtools/_dedup.pyx b/pairtools/_dedup.pyx index 8062e4bb..3164064d 100644 --- a/pairtools/_dedup.pyx +++ b/pairtools/_dedup.pyx @@ -151,6 +151,7 @@ cdef class OnlineDuplicateDetector(object): cdef cython.int [:] s2 cdef cython.int [:] min_mapq cdef cython.char [:] rm + cdef cython.char [:] rm_best cdef int methodid cdef int low cdef int high @@ -172,6 +173,8 @@ cdef class OnlineDuplicateDetector(object): self.s2 = np.zeros(0, np.int32) self.min_mapq = np.zeros(0, np.int32) self.rm = np.zeros(0, np.int8) + self.rm_best = np.zeros(0, np.int8) + if method == "max": self.methodid = 0 elif method == "sum": @@ -184,7 +187,8 @@ cdef class OnlineDuplicateDetector(object): def _shrink(self): if self.returnData == 1: - firstret = self.rm[:self.low] + # firstret = self.rm[:self.low] + firstret = self.rm_best[:self.low] retainMask = (np.asarray(firstret) == False) del firstret ret = [] @@ -196,7 +200,9 @@ cdef class OnlineDuplicateDetector(object): self.p2 = self.p2[self.low:] self.s1 = self.s1[self.low:] self.s2 = self.s2[self.low:] - pastrm = self.rm[:self.low] + # pastrm = self.rm[:self.low] + pastrm = self.rm_best[:self.low] + self.rm_best = self.rm_best[self.low:] self.rm = self.rm[self.low:] self.high = self.high-self.low self.N = self.N - self.low @@ -223,7 +229,10 @@ cdef class OnlineDuplicateDetector(object): break if self.high == self.N: - if finishing == 1: + if finishing == 1: + if self.low != best_ind: + self.rm_best[self.low] = 1 + self.rm_best[best_ind] = 0 self.low += 1 self.high = self.low + 1 if self.low < self.N: @@ -250,6 +259,10 @@ cdef class OnlineDuplicateDetector(object): if ((self.c1[self.high] != self.c1[self.low]) or (self.p1[self.high] - self.p1[self.low] > self.max_mismatch) or (self.p1[self.high] - self.p1[self.low] < 0 )): + if self.low != best_ind: + self.rm_best[self.low] = 1 + self.rm_best[best_ind] = 0 + self.low += 1 self.high = self.low + 1 # restart high if self.low < self.N: @@ -278,11 +291,10 @@ cdef class OnlineDuplicateDetector(object): extraCondition): # compare the min_mapq if(self.min_mapq[self.high] > best_minmapq): - self.rm[best_ind] = 1 best_ind = self.high best_minmapq = self.min_mapq[self.high] - else: - self.rm[self.high] = 1 + self.rm_best[self.high] = 1 + self.rm[self.high] = 1 self.high += 1 continue self.high += 1 @@ -298,6 +310,7 @@ cdef class OnlineDuplicateDetector(object): self.s2 = np.concatenate([self.s2, s2]) self.min_mapq = np.concatenate([self.min_mapq, min_mapq]) self.rm = np.concatenate([self.rm, np.zeros(len(c1), dtype=np.int8)]) + self.rm_best = np.concatenate([self.rm_best, np.zeros(len(c1), dtype=np.int8)]) self.N = self.N + len(c1) return self._run(finish=False) diff --git a/pairtools/pairtools_dedup.py b/pairtools/pairtools_dedup.py index cabe5202..c1755586 100644 --- a/pairtools/pairtools_dedup.py +++ b/pairtools/pairtools_dedup.py @@ -379,6 +379,7 @@ def streaming_dedup( # get min_mapq if bool(mapq1ind) and bool(mapq2ind): min_mapq.append(min(int(cols[mapq1ind]), int(cols[mapq2ind]))) + # print(min(int(cols[mapq1ind]), int(cols[mapq2ind]))) else: min_mapq.append(0)