From 34825783e2178a09ff989d1f55f350606f9e5b68 Mon Sep 17 00:00:00 2001
From: IanSudbery <ian.sudbery@dpag.ox.ac.uk>
Date: Sun, 28 Mar 2021 19:39:28 +0100
Subject: [PATCH 1/6] Created prepare_for_rsem script to make output valid for
 rsem.

---
 umi_tools/prepare_for_rsem.py | 219 ++++++++++++++++++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100644 umi_tools/prepare_for_rsem.py

diff --git a/umi_tools/prepare_for_rsem.py b/umi_tools/prepare_for_rsem.py
new file mode 100644
index 0000000..1cccc93
--- /dev/null
+++ b/umi_tools/prepare_for_rsem.py
@@ -0,0 +1,219 @@
+'''
+==============================================================================
+prepare_for_rsem - make the output from dedup or group compatible with RSEM
+===============================================================================
+The SAM format specification states that the mnext and mpos fields should point
+to the primary alignment of a read's mate. However, not all aligners adhere to
+this standard. In addition, the RSEM software requires that the mate of a read1
+appears directly after it in its input BAM. This requires that there is exactly
+one read1 alignment for every read2 and vice versa.
+
+In general (except in a few edge cases) UMI tools outputs only the read2 to that 
+corresponds to the read specified in the mnext and mpos positions of a selected
+read1, and only outputs this read once, even if multiple read1s point to it. 
+This makes UMI-tools outputs incompatible with RSEM. This script takes the output
+from dedup or groups and ensures that each read1 has exactly one read2 (and vice
+versa), that read2 always appears directly after read1,and that pairs point to 
+each other (note this is technically not valid SAM format). Copy any specified
+tags from read1 to read2 if they are present (by default, UG and BX, the unique
+group and correct UMI tags added by _group_)
+
+Input must to name sorted.
+
+'''
+
+
+from umi_tools import Utilities as U
+from collections import defaultdict
+import pysam
+import sys
+
+usage = '''
+prepare_for_rsem - make output from dedup or group compatible with RSEM
+
+Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is ommited, standard out is output. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr '''
+
+def chunk_bam(bamfile):
+    '''Take in a iterator of pysam.AlignmentSegment entries and yeild
+    lists of reads that all share the same name'''
+
+    last_query_name = None
+    output_buffer = list()
+
+    for read in bamfile:
+    
+        if last_query_name is not None and last_query_name != read.query_name:
+            yield(output_buffer)
+            output_buffer = list()
+        
+        output_buffer.append(read)
+
+    yield (output_buffer)
+
+def copy_tags(tags, read1, read2):
+    '''Given a  list of tags, copies the values of these tags from read1
+    to read2, if the tag is set'''
+
+    for tag in tags:
+        
+        try:
+            read1_tag = read1.get_tag(tag, with_value_type=True)
+            read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1])
+        except KeyError:
+            pass
+        
+    return(read2)
+
+
+def main(argv=None):
+
+    if argv is None:
+        argv = sys.argv
+
+    # setup command line parser
+    parser = U.OptionParser(version="%prog version: $Id$",
+                            usage=usage,
+                            description=globals()["__doc__"])
+    group = U.OptionGroup(parser, "RSEM preparation specific options")
+
+    group.add_option("--tags", dest="tags", type="string",
+                     default="UG,BX",
+                     help="Comma-seperated list of tags to transfer from read1 to read2")
+    group.add_option("--sam", dest="sam", action="store_true",
+                     default=False,
+                     help="input and output SAM rather than BAM")
+
+    parser.add_option_group(group)
+
+    # add common options (-h/--help, ...) and parse command line
+    (options, args) = U.Start(parser, argv=argv,
+                              add_group_dedup_options=False,
+                              add_umi_grouping_options=False,
+                              add_sam_options=False)
+
+    if options.stdin != sys.stdin:
+        in_name = options.stdin.name
+        options.stdin.close()
+    else:
+        in_name = "-"
+
+    if options.sam:
+        mode = ""
+    else:
+        mode = "b"
+
+    inbam = pysam.AlignmentFile(in_name, "r"+mode)
+
+    if options.stdout != sys.stdout:
+        out_name = options.stdout.name
+        options.stdout.close()
+    else:
+        out_name = "-"
+
+    outbam = pysam.AlignmentFile(out_name, "w" + mode, template = inbam)
+
+    options.tags = options.tags.split(",")
+
+    for template in chunk_bam(inbam):
+        
+        current_template = {True: defaultdict(list),
+                            False: defaultdict(list)}
+
+        for read in template:
+            key = (read.reference_name, read.pos, not read.is_secondary)
+            current_template[read.is_read1][key].append(read)
+
+        output = set()
+
+        for read in template:
+           
+            mate = None
+           
+            # if this read is a non_primary alignment, we first want to check if it has a mate
+            # with the non-primary alignment flag set. 
+            mate_key_secondary = (read.next_reference_name, read.next_reference_start, False)
+            if read.is_secondary:
+
+                # get a list of secondary reads at the correct alignment position
+                potential_secondary_mates = current_template[not read.is_read1][mate_key_secondary]
+
+                # search through one at a time to find a read that points to the current read
+                # as its mate.
+                for candidate_mate in potential_secondary_mates:
+                    if candidate_mate.next_reference_name == read.reference_name and \
+                       candidate_mate.next_reference_start == read.pos:
+                        mate = candidate_mate
+                
+                # if no such read is found, then pick any old secondary alignment at that position
+                # note: this happens when UMI-tools outputs the wrong read as somethings pair.
+                if mate is None and len(potential_secondary_mates) >0:
+                    mate = potential_secondary_mates[0]
+
+            # The following happens if read is primary, or is secondary, but doesn't point to a secondary
+            # alignment as its mate (or at least we didn't find one). If which case find the primary 
+            # alignment at that location.
+            if mate is None:
+                mate_key_primary = (read.next_reference_name, read.next_reference_start, True)
+
+                try:
+                    # there should be exactly one primary alignment at this location
+                    mate = current_template[not read.is_read1][mate_key_primary][0]
+                except KeyError:
+                    U.warn("Alignment {} has no mate -- skipped".format(
+                        "\t".join([read.query_name, read.flag, read.reference_name, read.pos])
+                    ))
+                    continue
+            
+            # because we might want to make changes to the read, but not have those changes reflected
+            # if we need the read again,we copy the read. This is only way I can find to do this.
+            read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header)
+            mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header)
+
+            # Make it so that if our read is secondary, the mate is also secondary. We don't make the
+            # mate primary if the read is primary because we would otherwise end up with mulitple
+            # primary alignments. 
+            if read.is_secondary:
+                mate.is_secondary = True
+            
+            # In a situation where there is already one mate for each read, then we will come across
+            # each pair twice - once when we scan read1 and once when we scan read2. Thus we need
+            # to make sure we don't output something already output. 
+            if read.is_read1:
+                
+                mate = copy_tags(options.tags, read, mate)  
+                output_key = str(read) + str(mate)
+
+                if output_key not in output:
+                    output.add(output_key)
+                    outbam.write(read)
+                    outbam.write(mate)
+
+            elif read.is_read2:
+
+                read = copy_tags(options.tags, mate, read)
+                output_key = str(mate) + str(read)
+
+                if output_key not in output:
+                    output.add(output_key)
+                    outbam.write(mate)
+                    outbam.write(key)
+
+            else:
+                U.warn("Alignment {} is neither read1 nor read2 -- skipped".format(
+                    "\t".join([read.query_name, read.flag, read.reference_name, read.pos])
+                ))
+                continue
+
+    if not out_name == "-":
+        outbam.close()
+    U.Stop()
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
+
+
+

From f4f93743608b100856d57c973400bf1736dc5760 Mon Sep 17 00:00:00 2001
From: IanSudbery <ian.sudbery@dpag.ox.ac.uk>
Date: Sun, 28 Mar 2021 19:41:47 +0100
Subject: [PATCH 2/6] more sensible naming

---
 umi_tools/{prepare_for_rsem.py => prepare-for-rsem.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename umi_tools/{prepare_for_rsem.py => prepare-for-rsem.py} (100%)

diff --git a/umi_tools/prepare_for_rsem.py b/umi_tools/prepare-for-rsem.py
similarity index 100%
rename from umi_tools/prepare_for_rsem.py
rename to umi_tools/prepare-for-rsem.py

From b88e923d82d05f6d422f86c05389b9e522fd581c Mon Sep 17 00:00:00 2001
From: Ian Sudbery <i.sudbery@sheffield.ac.uk>
Date: Sat, 4 Dec 2021 17:01:37 +0000
Subject: [PATCH 3/6] Fix typo in output code for prepare-for-rsem

---
 umi_tools/prepare-for-rsem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py
index 1cccc93..ab5d413 100644
--- a/umi_tools/prepare-for-rsem.py
+++ b/umi_tools/prepare-for-rsem.py
@@ -200,7 +200,7 @@ def main(argv=None):
                 if output_key not in output:
                     output.add(output_key)
                     outbam.write(mate)
-                    outbam.write(key)
+                    outbam.write(read)
 
             else:
                 U.warn("Alignment {} is neither read1 nor read2 -- skipped".format(

From e3f348e21f482d289da5f73cc42b779d7f69876b Mon Sep 17 00:00:00 2001
From: Ian Sudbery <5391758+IanSudbery@users.noreply.github.com>
Date: Tue, 14 Dec 2021 10:34:37 +0000
Subject: [PATCH 4/6] Fix exception type in prepare-for-RSEM

prepare-for-rsem tries to identify cases where a read has no mate by catching a look up error. However, the wrong exception type was specified. Since current template is a dictionary of defaultdicts, it shouldn't throw a key error. However the returned list will be empty, so any attempt to index into it will throw an IndexError.
---
 umi_tools/prepare-for-rsem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py
index ab5d413..62ca956 100644
--- a/umi_tools/prepare-for-rsem.py
+++ b/umi_tools/prepare-for-rsem.py
@@ -162,7 +162,7 @@ def main(argv=None):
                 try:
                     # there should be exactly one primary alignment at this location
                     mate = current_template[not read.is_read1][mate_key_primary][0]
-                except KeyError:
+                except (KeyError, IndexError):
                     U.warn("Alignment {} has no mate -- skipped".format(
                         "\t".join([read.query_name, read.flag, read.reference_name, read.pos])
                     ))

From fe52a3c3e34851b9e024529e216662a1a4912f73 Mon Sep 17 00:00:00 2001
From: IanSudbery <ian.sudbery@dpag.ox.ac.uk>
Date: Tue, 14 Dec 2021 11:59:55 +0000
Subject: [PATCH 5/6] prepare-for-rsem - edits to fix logging code on failing
 to find mates

---
 umi_tools/prepare-for-rsem.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py
index 62ca956..4f7be75 100644
--- a/umi_tools/prepare-for-rsem.py
+++ b/umi_tools/prepare-for-rsem.py
@@ -24,7 +24,7 @@
 
 
 from umi_tools import Utilities as U
-from collections import defaultdict
+from collections import defaultdict, Counter
 import pysam
 import sys
 
@@ -95,6 +95,8 @@ def main(argv=None):
                               add_umi_grouping_options=False,
                               add_sam_options=False)
 
+    skipped_stats = Counter()
+
     if options.stdin != sys.stdin:
         in_name = options.stdin.name
         options.stdin.close()
@@ -163,8 +165,9 @@ def main(argv=None):
                     # there should be exactly one primary alignment at this location
                     mate = current_template[not read.is_read1][mate_key_primary][0]
                 except (KeyError, IndexError):
+                    skipped_stats["no_mate"] += 1
                     U.warn("Alignment {} has no mate -- skipped".format(
-                        "\t".join([read.query_name, read.flag, read.reference_name, read.pos])
+                        "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
                     ))
                     continue
             
@@ -191,6 +194,7 @@ def main(argv=None):
                     output.add(output_key)
                     outbam.write(read)
                     outbam.write(mate)
+                    skipped_stats["pairs_output"] += 1
 
             elif read.is_read2:
 
@@ -201,15 +205,23 @@ def main(argv=None):
                     output.add(output_key)
                     outbam.write(mate)
                     outbam.write(read)
+                    skipped_stats["pairs_output"] += 1
 
             else:
+                skipped_stats["skipped_not_read12"] += 1
                 U.warn("Alignment {} is neither read1 nor read2 -- skipped".format(
-                    "\t".join([read.query_name, read.flag, read.reference_name, read.pos])
+                    "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
                 ))
                 continue
 
     if not out_name == "-":
         outbam.close()
+    
+    U.info("Total pairs output: {}, Pairs skipped - no mates: {},"
+           " Pairs skipped - not read1 or 2: {}".format(
+               skipped_stats["pairs_output"],
+               skipped_stats["no_mate"],
+               skipped_stats["skipped_not_read12"]))
     U.Stop()
 
 if __name__ == "__main__":

From bf8608d6a172c5ca0dcf33c126b4e23429177a72 Mon Sep 17 00:00:00 2001
From: Ian Sudbery <i.sudbery@sheffield.ac.uk>
Date: Tue, 28 Dec 2021 22:57:41 +0000
Subject: [PATCH 6/6] Refactored prepare-for-rsem so that reads are chunked
 properly, and will find secondary mates of primary reads

---
 umi_tools/prepare-for-rsem.py | 77 +++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py
index 4f7be75..b7a8bea 100644
--- a/umi_tools/prepare-for-rsem.py
+++ b/umi_tools/prepare-for-rsem.py
@@ -50,6 +50,7 @@ def chunk_bam(bamfile):
             yield(output_buffer)
             output_buffer = list()
         
+        last_query_name = read.query_name
         output_buffer.append(read)
 
     yield (output_buffer)
@@ -68,6 +69,29 @@ def copy_tags(tags, read1, read2):
         
     return(read2)
 
+def pick_mate(read, template_dict, mate_key):
+    '''Find the mate of read in the template dict using key. It will retreieve
+    all reads at that key, and then scan to pick the one that refers to _read_
+    as it's mate. If there is no such read, it picks a first one it comes to'''
+
+    mate = None
+
+    # get a list of secondary reads at the correct alignment position
+    potential_mates = template_dict[not read.is_read1][mate_key]
+
+    # search through one at a time to find a read that points to the current read
+    # as its mate.
+    for candidate_mate in potential_mates:
+        if candidate_mate.next_reference_name == read.reference_name and \
+            candidate_mate.next_reference_start == read.pos:
+            mate = candidate_mate
+               
+    # if no such read is found, then pick any old secondary alignment at that position
+    # note: this happens when UMI-tools outputs the wrong read as something's pair.
+    if mate is None and len(potential_mates) >0:
+        mate = potential_mates[0]
+
+    return mate
 
 def main(argv=None):
 
@@ -122,6 +146,7 @@ def main(argv=None):
 
     for template in chunk_bam(inbam):
         
+        assert len(set(r.query_name for r in template)) == 1
         current_template = {True: defaultdict(list),
                             False: defaultdict(list)}
 
@@ -137,39 +162,31 @@ def main(argv=None):
            
             # if this read is a non_primary alignment, we first want to check if it has a mate
             # with the non-primary alignment flag set. 
-            mate_key_secondary = (read.next_reference_name, read.next_reference_start, False)
-            if read.is_secondary:
 
-                # get a list of secondary reads at the correct alignment position
-                potential_secondary_mates = current_template[not read.is_read1][mate_key_secondary]
+            mate_key_primary = ( True)
+            mate_key_secondary = (read.next_reference_name, read.next_reference_start, False)
+            
+            # First look for a read that has the same primary/secondary status
+            # as read (i.e. secondary mate for secondary read, and primary mate
+            # for primary read)
+            mate_key = (read.next_reference_name, read.next_reference_start,
+                        read.is_secondary)
+            mate = pick_mate(read, current_template, mate_key)
+
+            # If none was found then look for the opposite (primary mate of secondary
+            # read or seconadary mate of primary read)
+            if mate is None:
+                mate_key = (read.next_reference_name, read.next_reference_start,
+                            not read.is_secondary)
+                mate = pick_mate(read, current_template, mate_key)
 
-                # search through one at a time to find a read that points to the current read
-                # as its mate.
-                for candidate_mate in potential_secondary_mates:
-                    if candidate_mate.next_reference_name == read.reference_name and \
-                       candidate_mate.next_reference_start == read.pos:
-                        mate = candidate_mate
-                
-                # if no such read is found, then pick any old secondary alignment at that position
-                # note: this happens when UMI-tools outputs the wrong read as somethings pair.
-                if mate is None and len(potential_secondary_mates) >0:
-                    mate = potential_secondary_mates[0]
-
-            # The following happens if read is primary, or is secondary, but doesn't point to a secondary
-            # alignment as its mate (or at least we didn't find one). If which case find the primary 
-            # alignment at that location.
+            # If we still don't have a mate, then their can't be one?
             if mate is None:
-                mate_key_primary = (read.next_reference_name, read.next_reference_start, True)
-
-                try:
-                    # there should be exactly one primary alignment at this location
-                    mate = current_template[not read.is_read1][mate_key_primary][0]
-                except (KeyError, IndexError):
-                    skipped_stats["no_mate"] += 1
-                    U.warn("Alignment {} has no mate -- skipped".format(
-                        "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
-                    ))
-                    continue
+                skipped_stats["no_mate"] += 1
+                U.warn("Alignment {} has no mate -- skipped".format(
+                    "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
+                ))
+                continue
             
             # because we might want to make changes to the read, but not have those changes reflected
             # if we need the read again,we copy the read. This is only way I can find to do this.