Modify pairtools dedup to select the best MAPQ alignment as output of deduplication #95

Status: Closed (wants to merge 2 commits)
pairtools/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -11,7 +11,7 @@
 
 """
 
-__version__ = '0.3.0'
+__version__ = '0.3.0b0'
 
 
 import click
pairtools/_dedup.pyx (49 changes: 43 additions & 6 deletions)

@@ -149,7 +149,9 @@ cdef class OnlineDuplicateDetector(object):
     cdef cython.int [:] p2
     cdef cython.int [:] s1
     cdef cython.int [:] s2
+    cdef cython.int [:] min_mapq
     cdef cython.char [:] rm
+    cdef cython.char [:] rm_best
     cdef int methodid
     cdef int low
     cdef int high
@@ -168,8 +170,11 @@
         self.p1 = np.zeros(0, np.int32)
         self.p2 = np.zeros(0, np.int32)
         self.s1 = np.zeros(0, np.int32)
         self.s2 = np.zeros(0, np.int32)
+        self.min_mapq = np.zeros(0, np.int32)
         self.rm = np.zeros(0, np.int8)
+        self.rm_best = np.zeros(0, np.int8)
+
         if method == "max":
             self.methodid = 0
         elif method == "sum":
@@ -182,7 +187,8 @@
 
     def _shrink(self):
         if self.returnData == 1:
-            firstret = self.rm[:self.low]
+            # firstret = self.rm[:self.low]
+            firstret = self.rm_best[:self.low]
             retainMask = (np.asarray(firstret) == False)
             del firstret
             ret = []
@@ -194,7 +200,9 @@
         self.p2 = self.p2[self.low:]
         self.s1 = self.s1[self.low:]
         self.s2 = self.s2[self.low:]
-        pastrm = self.rm[:self.low]
+        # pastrm = self.rm[:self.low]
+        pastrm = self.rm_best[:self.low]
+        self.rm_best = self.rm_best[self.low:]
         self.rm = self.rm[self.low:]
         self.high = self.high-self.low
         self.N = self.N - self.low
@@ -206,25 +214,40 @@
     def _run(self, finish=False):
         cdef int finishing = 0
         cdef int extraCondition
+        cdef int best_ind
+        cdef int best_minmapq
 
         if finish:
             finishing = 1
 
+        if self.low < self.N:
+            best_ind = self.low
+            best_minmapq = self.min_mapq[self.low]
+
         while True:
             if self.low == self.N:
                 break
 
             if self.high == self.N:
                 if finishing == 1:
+                    if self.low != best_ind:
+                        self.rm_best[self.low] = 1
+                        self.rm_best[best_ind] = 0
                     self.low += 1
                     self.high = self.low + 1
+                    if self.low < self.N:
+                        best_ind = self.low
+                        best_minmapq = self.min_mapq[self.low]
                     continue
                 else:
                     break
 
             if self.rm[self.low] == 1:
                 self.low += 1
                 self.high = self.low+1
+                if self.low < self.N:
+                    best_ind = self.low
+                    best_minmapq = self.min_mapq[self.low]
                 continue
 
             # if high already removed, just continue
@@ -236,8 +259,15 @@
             if ((self.c1[self.high] != self.c1[self.low]) or
                 (self.p1[self.high] - self.p1[self.low] > self.max_mismatch) or
                 (self.p1[self.high] - self.p1[self.low] < 0 )):
+                if self.low != best_ind:
+                    self.rm_best[self.low] = 1
+                    self.rm_best[best_ind] = 0
+
                 self.low += 1
                 self.high = self.low + 1 # restart high
+                if self.low < self.N:
+                    best_ind = self.low
+                    best_minmapq = self.min_mapq[self.low]
                 continue
 
             if self.methodid == 0:
@@ -259,21 +289,28 @@
                  (self.s1[self.low] == self.s1[self.high]) and
                  (self.s2[self.low] == self.s2[self.high]) and
                  extraCondition):
+                # compare the min_mapq
+                if (self.min_mapq[self.high] > best_minmapq):
+                    best_ind = self.high
+                    best_minmapq = self.min_mapq[self.high]
+                self.rm_best[self.high] = 1
                 self.rm[self.high] = 1
                 self.high += 1
                 continue
             self.high += 1
 
         return self._shrink()
 
-    def push(self, c1, c2, p1, p2, s1, s2):
+    def push(self, c1, c2, p1, p2, s1, s2, min_mapq):
         self.c1 = np.concatenate([self.c1, c1])
         self.c2 = np.concatenate([self.c2, c2])
         self.p1 = np.concatenate([self.p1, p1])
         self.p2 = np.concatenate([self.p2, p2])
         self.s1 = np.concatenate([self.s1, s1])
         self.s2 = np.concatenate([self.s2, s2])
+        self.min_mapq = np.concatenate([self.min_mapq, min_mapq])
         self.rm = np.concatenate([self.rm, np.zeros(len(c1), dtype=np.int8)])
+        self.rm_best = np.concatenate([self.rm_best, np.zeros(len(c1), dtype=np.int8)])
        self.N = self.N + len(c1)
        return self._run(finish=False)
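
The rm_best/min_mapq bookkeeping above boils down to a simple rule: within every cluster of duplicates, keep the pair with the largest min(mapq1, mapq2) and mark all the others; ties go to the earlier pair because the comparison is strict. A minimal pure-Python sketch of that rule, not the PR's Cython code (keep_best_by_mapq and the example values are made up for illustration):

    import numpy as np

    def keep_best_by_mapq(clusters, min_mapq):
        """For each duplicate cluster (a list of row indices), keep only the row
        whose min(mapq1, mapq2) is highest; ties keep the earliest row, matching
        the strict '>' comparison used in _dedup.pyx."""
        rm_best = np.zeros(len(min_mapq), dtype=np.int8)
        for cluster in clusters:
            best = cluster[0]
            for i in cluster:
                rm_best[i] = 1                    # tentatively mark every member
                if min_mapq[i] > min_mapq[best]:  # strictly larger min-MAPQ wins
                    best = i
            rm_best[best] = 0                     # un-mark the retained pair
        return rm_best

    # One cluster of three duplicates with min-MAPQs 3, 60, 60: the pair at index 1
    # is kept (first of the tied best), whereas plain positional dedup keeps index 0.
    print(keep_best_by_mapq([[0, 1, 2]], np.array([3, 60, 60])))  # -> [1 0 1]
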
pairtools/pairtools_dedup.py (45 changes: 38 additions & 7 deletions)

@@ -117,6 +117,17 @@
     type=int,
     default=_pairsam_format.COL_S2,
     help='Strand 2 column; default {}'.format(_pairsam_format.COL_S2))
+# Provide MAPQ values for choosing the best-mapped pair among duplicates to keep. The default is to keep
+# the first pair encountered, which may be a low-MAPQ pair that gets filtered out in downstream analysis,
+# such as pairtools select with a MAPQ threshold. With the MAPQ values available, the high-MAPQ "duplicated"
+# pair can be selected instead, avoiding the problem.
+@click.option(
+    "--mapq-cols",
+    nargs=2,
+    help='MAPQ columns 1 and 2; can be provided either as 0-based column indices or as column '
+         'names (requires the "#columns" header field). Example: \'--mapq-cols "mapq1" "mapq2"\' or \'--mapq-cols 10 11\'. '
+         'The MAPQ values are used to select the best pair to keep when there are duplicates. '
+         'The best pair is the one with the highest value of min(mapq1, mapq2).')
 @click.option(
     "--unmapped-chrom",
     type=str,
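
A worked example of the behaviour the comment and help text above describe (the MAPQ values are hypothetical): if two pairs are duplicates of each other, one with MAPQs (3, 60) and the other with (60, 60), default dedup keeps whichever comes first; if that is the (3, 60) pair, a downstream pairtools select '(mapq1>=30) and (mapq2>=30)' drops it and the contact is lost entirely. With --mapq-cols mapq1 mapq2 the (60, 60) pair is kept, because min(60, 60) = 60 beats min(3, 60) = 3. A small sketch of that selection key (min_mapq_key and the tuples are made up for illustration):

    def min_mapq_key(mapq1, mapq2):
        """Ranking key for duplicates: keep the pair whose worse side is best."""
        return min(int(mapq1), int(mapq2))

    # Two duplicates of the same molecule as (read_id, mapq1, mapq2); values are made up.
    dups = [("dup_a", 3, 60), ("dup_b", 60, 60)]
    kept = max(dups, key=lambda d: min_mapq_key(d[1], d[2]))
    print(kept[0])  # -> dup_b; plain positional dedup would have kept dup_a
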
@@ -145,7 +156,7 @@ def dedup(pairs_path, output, output_dups, output_unmapped,
           output_stats,
           max_mismatch, method,
           sep, comment_char, send_header_to,
-          c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair, **kwargs
+          c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair, **kwargs
     ):
     '''Find and remove PCR/optical duplicates.
 
@@ -160,7 +171,7 @@ def dedup(pairs_path, output, output_dups, output_unmapped,
              output_stats,
              max_mismatch, method,
              sep, comment_char, send_header_to,
-             c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair,
+             c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair,
              **kwargs
         )
 
@@ -170,7 +181,7 @@ def dedup_py(
     output_stats,
     max_mismatch, method,
     sep, comment_char, send_header_to,
-    c1, c2, p1, p2, s1, s2, unmapped_chrom, mark_dups, extra_col_pair,
+    c1, c2, p1, p2, s1, s2, mapq_cols, unmapped_chrom, mark_dups, extra_col_pair,
     **kwargs
     ):
     sep = ast.literal_eval('"""' + sep + '"""')
@@ -229,6 +240,15 @@ def dedup_py(
     outstream_unmapped.writelines((l+'\n' for l in header))
 
     column_names = _headerops.extract_column_names(header)
+
+    # use mapq1 and mapq2 in dedup if provided
+    mapq1 = None
+    mapq2 = None
+    if mapq_cols:
+        mapq1 = int(mapq_cols[0]) if mapq_cols[0].isdigit() else column_names.index(mapq_cols[0])
+        mapq2 = int(mapq_cols[1]) if mapq_cols[1].isdigit() else column_names.index(mapq_cols[1])
+        # print("MAPQ cols:", mapq1, mapq2)
+
     extra_cols1 = []
     extra_cols2 = []
     if extra_col_pair is not None:
@@ -240,7 +260,7 @@ def dedup_py(
 
 
     streaming_dedup( method, max_mismatch, sep,
-        c1, c2, p1, p2, s1, s2, extra_cols1, extra_cols2, unmapped_chrom,
+        c1, c2, p1, p2, s1, s2, mapq1, mapq2, extra_cols1, extra_cols2, unmapped_chrom,
         body_stream, outstream, outstream_dups,
         outstream_unmapped, out_stat, mark_dups)
@@ -278,12 +298,15 @@ def ar(mylist, val):
 
 def streaming_dedup(
         method, max_mismatch, sep,
-        c1ind, c2ind, p1ind, p2ind, s1ind, s2ind, extra_cols1, extra_cols2,
+        c1ind, c2ind, p1ind, p2ind, s1ind, s2ind, mapq1ind, mapq2ind, extra_cols1, extra_cols2,
         unmapped_chrom,
         instream, outstream, outstream_dups, outstream_unmapped,
         out_stat, mark_dups):
 
     maxind = max(c1ind, c2ind, p1ind, p2ind, s1ind, s2ind)
+    if bool(mapq1ind) and bool(mapq2ind):
+        maxind = max(maxind, mapq1ind, mapq2ind)
+
     if bool(extra_cols1) and bool(extra_cols2):
         maxind = max(maxind, max(extra_cols1), max(extra_cols2))
 
@@ -298,7 +321,7 @@
 
     dd = _dedup.OnlineDuplicateDetector(method, max_mismatch, returnData=False)
 
-    c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []
+    c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []; min_mapq = []
     line_buffer = []
     cols_buffer = []
     chromDict = {}
@@ -353,13 +376,21 @@
                 s1.append(fetchadd(cols[s1ind], strandDict))
                 s2.append(fetchadd(cols[s2ind], strandDict))
 
+                # get min_mapq
+                if bool(mapq1ind) and bool(mapq2ind):
+                    min_mapq.append(min(int(cols[mapq1ind]), int(cols[mapq2ind])))
+                    # print(min(int(cols[mapq1ind]), int(cols[mapq2ind])))
+                else:
+                    min_mapq.append(0)
+
         if (not stripline) or (len(c1) == curMaxLen):
             res = dd.push(ar(c1, 32),
                           ar(c2, 32),
                           ar(p1, 32),
                           ar(p2, 32),
                           ar(s1, 32),
-                          ar(s2, 32))
+                          ar(s2, 32),
+                          ar(min_mapq, 32))
             if not stripline:
                 res = np.concatenate([res, dd.finish()])
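
Taken together, the dedup_py and streaming_dedup changes feed one extra number per pair into the duplicate detector: the column values given to --mapq-cols are resolved to indices, and every row contributes min(mapq1, mapq2) as its ranking key. A sketch of that path under the assumptions visible in the diff (digit strings are treated as 0-based indices, names are looked up in the "#columns" header, and the key falls back to 0 when the option is absent); resolve_mapq_cols, row_min_mapq and the file names in the example command are made up for illustration:

    def resolve_mapq_cols(mapq_cols, column_names):
        """Turn the two --mapq-cols values into 0-based column indices: a digit
        string is used as an index, anything else is looked up among the
        '#columns' header names."""
        if not mapq_cols:
            return None, None
        mapq1 = int(mapq_cols[0]) if mapq_cols[0].isdigit() else column_names.index(mapq_cols[0])
        mapq2 = int(mapq_cols[1]) if mapq_cols[1].isdigit() else column_names.index(mapq_cols[1])
        return mapq1, mapq2

    def row_min_mapq(cols, mapq1ind, mapq2ind):
        """Per-row dedup key: min of the two MAPQ fields, or 0 when --mapq-cols
        was not given."""
        if mapq1ind is None or mapq2ind is None:
            return 0
        return min(int(cols[mapq1ind]), int(cols[mapq2ind]))

    # Hypothetical invocation with named MAPQ columns:
    #   pairtools dedup --mapq-cols mapq1 mapq2 --output deduped.pairs input.pairs

Falling back to 0 when the option is absent preserves the old behaviour: all pairs in a cluster then tie on the key, so the first pair is kept exactly as before.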