Also report position of 2nd read in "group --group-out" table #180

Status: Open · wants to merge 15 commits into master
3 changes: 3 additions & 0 deletions tests/count_paired_wide_contig.tsv
@@ -0,0 +1,3 @@
gene count
AAC73112 14
AAC73113 50
133 changes: 133 additions & 0 deletions tests/dedup_paired_contig.sam

Large diffs are not rendered by default.

133 changes: 133 additions & 0 deletions tests/dedup_paired_contig_py3.sam

Large diffs are not rendered by default.

6,859 changes: 6,859 additions & 0 deletions tests/group_paired_contig.sam

Large diffs are not rendered by default.

3,371 changes: 3,371 additions & 0 deletions tests/group_paired_contig.tsv

Large diffs are not rendered by default.

6,859 changes: 6,859 additions & 0 deletions tests/group_paired_contig_py3.sam

Large diffs are not rendered by default.

3,371 changes: 3,371 additions & 0 deletions tests/group_paired_contig_py3.tsv

Large diffs are not rendered by default.

Binary file added tests/paired_contig.bam
Binary file not shown.
Binary file added tests/paired_contig.bam.bai
Binary file not shown.
20 changes: 20 additions & 0 deletions tests/tests.yaml
@@ -140,6 +140,12 @@ count_tab_single_per_cell:
references: [count_tab_cell.tsv]
options: count_tab -L test.log --per-cell

count_paired_wide_contig:
stdin: paired_contig.bam
outputs: [stdout]
references: [count_paired_wide_contig.tsv]
options: count -L test.log --random-seed=123456789 --method=directional --umi-separator="|" --paired --per-contig --wide-format-cell-counts

dedup_single_ignore:
stdin: chr19.bam
outputs: [stdout]
@@ -225,12 +231,26 @@ dedup_single_gene_tag:
references: [single_gene_tag_py3.sam]
options: dedup -L test.log --out-sam --random-seed=123456789 --method=directional --per-gene --gene-tag=XF --skip-tags-regex="^[__|Unassigned]"

dedup_paired_contig:
sort: True
stdin: paired_contig.bam
outputs: [stdout]
references: [dedup_paired_contig_py3.sam]
options: dedup -L test.log --out-sam --random-seed=123456789 --method=directional --umi-separator="|" --paired

group_gene_tag:
stdin: chr19_gene_tags.bam
outputs: [group_dir_per_gene_py3.tsv, stdout]
references: [group_dir_per_gene_py3.tsv, group_dir_per_gene_py3.sam]
options: group -L test.log --random-seed=123456789 --method=directional --per-gene --gene-tag=XF --skip-tags-regex="^[__|Unassigned]" --group-out=group_dir_per_gene_py3.tsv --output-bam --out-sam

group_paired_contig:
sort: True
stdin: paired_contig.bam
outputs: [stdout, group_paired_contig_py3.tsv]
references: [group_paired_contig_py3.sam, group_paired_contig_py3.tsv]
options: group -L test.log --random-seed=123456789 --umi-separator="|" --paired --output-bam --out-sam --group-out=group_paired_contig_py3.tsv

group_unique:
stdin: chr19.bam
outputs: [stdout, group_uniq_py3.tsv]
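The three new test entries above exercise the paired code paths end to end: paired_contig.bam is piped on stdin and the captured output is compared against the checked-in reference files. The sort: True flag on the dedup and group tests suggests an order-insensitive comparison, since the order in which completed pairs are flushed may differ from the reference. A rough sketch of such a comparison, with a hypothetical helper name (the repo's actual test runner may differ):

# Hedged sketch only: an order-insensitive file comparison of the kind
# "sort: True" appears to request. Function name and paths are illustrative.
def outputs_match(observed_path, reference_path, sort=False):
    with open(observed_path) as obs, open(reference_path) as ref:
        obs_lines, ref_lines = obs.readlines(), ref.readlines()
    if sort:
        obs_lines.sort()
        ref_lines.sort()
    return obs_lines == ref_lines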
9 changes: 4 additions & 5 deletions umi_tools/dedup.py
@@ -199,9 +199,6 @@ def main(argv=None):
infile = pysam.Samfile(in_name, in_mode)
outfile = pysam.Samfile(out_name, out_mode, template=infile)

if options.paired:
outfile = sam_methods.TwoPassPairWriter(infile, outfile)

nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

if options.detection_method:
@@ -292,8 +289,10 @@ def main(argv=None):
bundle=bundle,
threshold=options.threshold)

for read in reads:
for read, read2 in reads:
outfile.write(read)
if read2 is not None:
outfile.write(read2)
nOutput += 1

if options.stats:
@@ -304,7 +303,7 @@
[bundle[UMI]['count'] for UMI in bundle])

# collect post-dedup stats
post_cluster_umis = [bundle_iterator.barcode_getter(x)[0] for x in reads]
post_cluster_umis = [bundle_iterator.barcode_getter(r1)[0] for r1, r2 in reads]
stats_post_df_dict['UMI'].extend(umis)
stats_post_df_dict['counts'].extend(umi_counts)
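With this change the dedup loop consumes (read1, read2) tuples rather than bare reads, which is why the TwoPassPairWriter wrapper above is removed: both mates are written directly. A condensed, self-contained sketch of the new write path (function name illustrative, not part of the diff):

def write_dedup_output(reads, outfile):
    # Each retained entry is a (read1, read2) tuple; read2 is None for
    # unpaired reads, so one or two alignments are emitted per entry.
    n_output = 0
    for read, read2 in reads:
        outfile.write(read)
        if read2 is not None:
            outfile.write(read2)
        n_output += 1
    return n_output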

28 changes: 20 additions & 8 deletions umi_tools/group.py
@@ -79,7 +79,7 @@
import os

# required to make iteritems and zip python2 and python3 compatible
from builtins import dict
from builtins import dict, zip
from future.utils import iteritems

import pysam
@@ -187,8 +187,10 @@ def main(argv=None):
if options.tsv:
mapping_outfile = U.openFile(options.tsv, "w")
mapping_outfile.write("%s\n" % "\t".join(
["read_id", "contig", "position", "gene", "umi", "umi_count",
"final_umi", "final_umi_count", "unique_id"]))
["read_id", "contig", "position"] +
(["position2"] if options.paired else []) +
["gene", "umi", "umi_count", "final_umi", "final_umi_count",
"unique_id"]))

nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

@@ -262,28 +264,38 @@ def main(argv=None):

for umi in umi_group:
reads = bundle[umi]['read']
for read in reads:
reads2 = bundle[umi]['read2']
for read, read2 in zip(reads, reads2):
if outfile:
# Add the 'UG' tag to the read
read.set_tag('UG', unique_id)
read.set_tag(options.umi_group_tag, top_umi)
outfile.write(read)
if read2 is not None:
read2.set_tag('UG', unique_id)
read2.set_tag(options.umi_group_tag, top_umi)
outfile.write(read2)

if options.tsv:
if options.per_gene:
# XXX: What if the two reads disagree?
gene = read.get_tag(gene_tag)
else:
gene = "NA"
mapping_outfile.write("%s\n" % "\t".join(map(str, (
pos = sam_methods.get_read_position(
read, options.soft_clip_threshold)[1]
pos2 = sam_methods.get_read_position(
read2, options.soft_clip_threshold)[1] if read2 is not None else ""
mapping_outfile.write("%s\n" % "\t".join(map(str, [
read.query_name, read.reference_name,
sam_methods.get_read_position(
read, options.soft_clip_threshold)[1],
pos] +
([pos2] if options.paired else []) + [
gene,
umi.decode(),
counts[umi],
top_umi.decode(),
group_count,
unique_id))))
unique_id])))

nOutput += 1

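The net effect on the --group-out table: a position2 column is inserted after position in --paired mode, while single-end output keeps its original nine columns. A minimal sketch of the resulting header logic (helper name illustrative):

def group_out_header(paired):
    # "position2" is only present for --paired runs.
    cols = ["read_id", "contig", "position"]
    if paired:
        cols.append("position2")
    cols += ["gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]
    return "\t".join(cols)

assert "position2" not in group_out_header(False)  # 9 columns, as before
assert "position2" in group_out_header(True)       # 10 columns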
2 changes: 1 addition & 1 deletion umi_tools/network.py
@@ -405,7 +405,7 @@ def __call__(self, bundle, threshold):
final_umis = [cluster[0] for cluster in clusters]
umi_counts = [sum(counts[umi] for umi in cluster)
for cluster in clusters]
reads = [bundle[umi]["read"] for umi in final_umis]
reads = [(bundle[umi]["read"], bundle[umi]["read2"]) for umi in final_umis]

return (reads, final_umis, umi_counts)

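The clusterer now pairs each UMI's representative read1 with the matching "read2" slot that get_bundles maintains (see sam_methods.py below). A toy illustration of that layout, with plain strings standing in for pysam reads:

# Toy data only; real values are pysam AlignedSegment objects (or None).
bundle = {
    b"ACGT": {"read": "r1_a", "read2": "r2_a", "count": 3},
    b"ACGG": {"read": "r1_b", "read2": None, "count": 1},  # unpaired
}
final_umis = [b"ACGT", b"ACGG"]
reads = [(bundle[umi]["read"], bundle[umi]["read2"]) for umi in final_umis]
assert reads == [("r1_a", "r2_a"), ("r1_b", None)]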
148 changes: 133 additions & 15 deletions umi_tools/sam_methods.py
@@ -126,6 +126,99 @@ def get_barcode_umis(read, cell_barcode=False):
"read tag")


class get_readpairs:
# Used in place of a real read in incomplete pairs before they are completed
PendingMate = object()

# Pair key. To allow for multi-mapping, the key also includes read2's
# mapping position (its own start, or read1's mate start).
def get_pair_key(self, read):
return (read.qname, read.reference_start if read.is_read2 else read.next_reference_start)

def drain_queue_forcibly(self):
# Forcibly convert incomplete queued pairs into singletons.
# These necessarily contain only read1, not read2, since we
# only queue a pair once we have observed read1.
while self.queue:
pair = self.queue.popleft()
if pair[1] is get_readpairs.PendingMate:
pair[1] = None
U.warn("inconsistent BAM file: only read1 of proper pair %s found on chromosome %s, treating as unpaired" %
(str(self.get_pair_key(pair[0])), pair[0].reference_name))
yield pair
# Also forcibly convert incomplete unqueued pairs into singletons.
# These are the ones that contain only read2 but no read1.
for pair in self.incomplete_pairs.values():
if pair[0] is get_readpairs.PendingMate:
pair[0] = None
U.warn("inconsistent BAM file: only read2 of proper pair %s found on chromosome %s, treating as unpaired" %
(str(self.get_pair_key(pair[1])), pair[1].reference_name))
yield pair
self.incomplete_pairs.clear()

def __call__(self, inreads):
self.queue = collections.deque()
self.incomplete_pairs = dict()
self.current_chr = None

for read in inreads:
read_chr = read.reference_name
# Read index. 0 for read1 and 1 for read2
read_i = int(read.is_read2)
pair_key = self.get_pair_key(read)

# Output leftover incomplete read pairs if the chromosome changes
if self.current_chr is not None and self.current_chr != read_chr:
for queue_entry in self.drain_queue_forcibly():
yield queue_entry
self.current_chr = read_chr

# Queue read, either as a singleton, or as part of a read pair.
# Proper pairs should always have both mates mapped to the same chromosome,
# but to avoid weird behaviour on broken BAM files, we re-check.
if ((not read.is_proper_pair) or
read.is_unmapped or
read.mate_is_unmapped or
(read.reference_id != read.next_reference_id)):
# Read is not part of a proper pair. Enqueue as singleton
pair = [None, None]
pair[read_i] = read
self.queue.append(pair)
else:
if pair_key in self.incomplete_pairs:
# Read is part of an already queued (incomplete) read pair. Complete it.
pair = self.incomplete_pairs.pop(pair_key)
if pair[read_i] is not get_readpairs.PendingMate:
U.warn("inconsistent BAM file: both mates of %s flagged to be read%d" %
(str(pair_key), read_i+1))
read_i = 1 - read_i
pair[read_i] = read
else:
# Read is part of a new read pair. Create incomplete pair
pair = [get_readpairs.PendingMate, get_readpairs.PendingMate]
pair[read_i] = read
self.incomplete_pairs[pair_key] = pair
# Enqueue pair in the correct place for its 1st read
if read_i == 0:
self.queue.append(pair)

# Output queued reads and readpairs up to the first incomplete pair
while self.queue and sum(1 for r in self.queue[0] if r is get_readpairs.PendingMate) == 0:
yield self.queue.popleft()

# Output leftover incomplete read pairs at the end of the input
for queue_entry in self.drain_queue_forcibly():
yield queue_entry
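# Illustrative usage sketch (not part of the diff): get_readpairs streams
# (read1, read2) tuples from position-sorted input, holding each proper
# pair until its mate arrives; either slot is None for singletons. The
# BAM path is the test file added by this PR.
def _demo_get_readpairs():
    import pysam
    with pysam.AlignmentFile("tests/paired_contig.bam", "rb") as bam:
        for read1, read2 in get_readpairs()(bam.fetch(until_eof=True)):
            assert read1 is not None or read2 is not None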


class get_singletons:
def __call__(self, inreads):
for read in inreads:
if read.is_read2:
yield None, read
else:
yield read, None
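# Quick sanity sketch (illustrative only): get_singletons mirrors the
# (read1, read2) shape of get_readpairs for single-end input, so the
# bundler below can use one iteration contract for both modes.
def _demo_get_singletons():
    class _Stub(object):  # minimal stand-in, not a pysam read
        def __init__(self, is_read2):
            self.is_read2 = is_read2
    pairs = list(get_singletons()([_Stub(False), _Stub(True)]))
    assert pairs[0][1] is None and pairs[1][0] is None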


class get_bundles:

''' A functor - When called returns a dictionary of dictionaries,
@@ -208,7 +301,7 @@ def __init__(self,
self.read_counts = collections.defaultdict(
lambda: collections.defaultdict(dict))

def update_dicts(self, read, pos, key, umi):
def update_dicts(self, read, read2, pos, key, umi):

# The content of the reads_dict depends on whether all reads
# are being retained
Expand All @@ -219,9 +312,11 @@ def update_dicts(self, read, pos, key, umi):
self.reads_dict[pos][key][umi]["count"] += 1
except KeyError:
self.reads_dict[pos][key][umi]["read"] = [read]
self.reads_dict[pos][key][umi]["read2"] = [read2]
self.reads_dict[pos][key][umi]["count"] = 1
else:
self.reads_dict[pos][key][umi]["read"].append(read)
self.reads_dict[pos][key][umi]["read2"].append(read2)

elif self.only_count_reads:
# retain all reads per key
@@ -236,18 +331,25 @@
self.reads_dict[pos][key][umi]["count"] += 1
except KeyError:
self.reads_dict[pos][key][umi]["read"] = read
self.reads_dict[pos][key][umi]["read2"] = read2
self.reads_dict[pos][key][umi]["count"] = 1
self.read_counts[pos][key][umi] = 0
else:
if self.reads_dict[pos][key][umi]["read"].mapq > read.mapq:
dict_r1 = self.reads_dict[pos][key][umi]["read"]
dict_r2 = self.reads_dict[pos][key][umi]["read2"]
dict_mapq = dict_r1.mapq + (dict_r2.mapq if dict_r2 is not None else 0)
read_mapq = read.mapq + (read2.mapq if read2 is not None else 0)
if dict_mapq > read_mapq:
return

if self.reads_dict[pos][key][umi]["read"].mapq < read.mapq:
if read_mapq > dict_mapq:
self.reads_dict[pos][key][umi]["read"] = read
self.reads_dict[pos][key][umi]["read2"] = read2
self.read_counts[pos][key][umi] = 0
return

# TS: implemented different checks for multimapping here
# XXX: What to do in paired mode? Probably just complain...
if self.options.detection_method in ["NH", "X0"]:
tag = self.options.detection_method
if (self.reads_dict[pos][key][umi]["read"].opt(tag) <
@@ -270,6 +372,7 @@ def update_dicts(self, read, pos, key, umi):

if random.random() < prob:
self.reads_dict[pos][key][umi]["read"] = read
self.reads_dict[pos][key][umi]["read2"] = read2

def check_output(self):

@@ -311,14 +414,19 @@ def check_output(self):
return do_output, out_keys

def __call__(self, inreads):
if self.options.paired:
generator = get_readpairs()
else:
generator = get_singletons()

for read in inreads:
for read, read2 in generator(inreads):

if read.is_read2:
# Unpaired second reads are returned if the return_read2 option is set
if read is None:
if self.return_read2:
if not read.is_unmapped or (
read.is_unmapped and self.return_unmapped):
yield read, None, "single_read"
if not read2.is_unmapped or (
read2.is_unmapped and self.return_unmapped):
yield read2, None, "single_read"
continue
else:
self.read_events['Input Reads'] += 1
@@ -356,13 +464,11 @@

# if read1 is unmapped, yield immediately or skip read
if self.return_unmapped:
self.read_events['Input Reads'] += 1
yield read, None, "single_read"
continue

if self.options.paired and read.mate_is_unmapped:
if not read.is_unmapped:
self.read_events['Read 2 unmapped'] += 1
self.read_events['Read 2 unmapped'] += 1

# if paired end input and read2 is unmapped, skip unless
# options.unmapped_reads == "use", in which case TLEN will be 0
@@ -396,7 +502,8 @@ def __call__(self, inreads):
continue

if self.options.mapping_quality:
if read.mapq < self.options.mapping_quality:
if ((read.mapq < self.options.mapping_quality) or
(read2 is not None and read2.mapq < self.options.mapping_quality)):
self.read_events['< MAPQ threshold'] += 1
continue

@@ -500,12 +607,23 @@ def __call__(self, inreads):
else:
r_length = 0

key = (read.is_reverse, self.options.spliced and is_spliced,
self.options.paired*read.tlen, r_length)
key = (read.is_reverse, self.options.spliced and is_spliced, r_length)

# In paired mode, we add the 2nd read's properties as well
if self.options.paired and (read2 is not None):
start2, pos2, is_spliced2 = get_read_position(
read2, self.options.soft_clip_threshold)

if self.options.read_length:
r_length2 = read2.query_length
else:
r_length2 = 0

key = (key, read2.is_reverse, self.options.spliced and is_spliced2, r_length2)

# update dictionaries
key = (key, cell)
self.update_dicts(read, pos, key, umi)
self.update_dicts(read, read2, pos, key, umi)

if self.metacontig_contig:
# keep track of observed contigs for each gene
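Finally, note how the dedup key changes shape in paired mode: the old self.options.paired*read.tlen component is replaced by nesting read1's key inside a tuple that also carries read2's strand, splice state, and length. A toy sketch of the two shapes, with plain values standing in for the real read attributes:

def make_key(is_reverse, spliced, r_length, mate=None):
    # mate is a dict of read2 attributes in paired mode, else None;
    # all values here are toy stand-ins, not pysam fields.
    key = (is_reverse, spliced, r_length)
    if mate is not None:
        key = (key, mate["is_reverse"], mate["spliced"], mate["r_length"])
    return key

single = make_key(False, False, 75)
paired = make_key(False, False, 75,
                  mate={"is_reverse": True, "spliced": False, "r_length": 75})
assert paired[0] == single and single != paired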