From 34825783e2178a09ff989d1f55f350606f9e5b68 Mon Sep 17 00:00:00 2001 From: IanSudbery Date: Sun, 28 Mar 2021 19:39:28 +0100 Subject: [PATCH 1/6] Created prepare_for_rsem script to make output valid for rsem. --- umi_tools/prepare_for_rsem.py | 219 ++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 umi_tools/prepare_for_rsem.py diff --git a/umi_tools/prepare_for_rsem.py b/umi_tools/prepare_for_rsem.py new file mode 100644 index 0000000..1cccc93 --- /dev/null +++ b/umi_tools/prepare_for_rsem.py @@ -0,0 +1,219 @@ +''' +============================================================================== +prepare_for_rsem - make the output from dedup or group compatible with RSEM +=============================================================================== +The SAM format specification states that the mnext and mpos fields should point +to the primary alignment of a read's mate. However, not all aligners adhere to +this standard. In addition, the RSEM software requires that the mate of a read1 +appears directly after it in its input BAM. This requires that there is exactly +one read1 alignment for every read2 and vice versa. + +In general (except in a few edge cases) UMI tools outputs only the read2 to that +corresponds to the read specified in the mnext and mpos positions of a selected +read1, and only outputs this read once, even if multiple read1s point to it. +This makes UMI-tools outputs incompatible with RSEM. This script takes the output +from dedup or groups and ensures that each read1 has exactly one read2 (and vice +versa), that read2 always appears directly after read1,and that pairs point to +each other (note this is technically not valid SAM format). Copy any specified +tags from read1 to read2 if they are present (by default, UG and BX, the unique +group and correct UMI tags added by _group_) + +Input must to name sorted. + +''' + + +from umi_tools import Utilities as U +from collections import defaultdict +import pysam +import sys + +usage = ''' +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr ''' + +def chunk_bam(bamfile): + '''Take in a iterator of pysam.AlignmentSegment entries and yeild + lists of reads that all share the same name''' + + last_query_name = None + output_buffer = list() + + for read in bamfile: + + if last_query_name is not None and last_query_name != read.query_name: + yield(output_buffer) + output_buffer = list() + + output_buffer.append(read) + + yield (output_buffer) + +def copy_tags(tags, read1, read2): + '''Given a list of tags, copies the values of these tags from read1 + to read2, if the tag is set''' + + for tag in tags: + + try: + read1_tag = read1.get_tag(tag, with_value_type=True) + read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1]) + except KeyError: + pass + + return(read2) + + +def main(argv=None): + + if argv is None: + argv = sys.argv + + # setup command line parser + parser = U.OptionParser(version="%prog version: $Id$", + usage=usage, + description=globals()["__doc__"]) + group = U.OptionGroup(parser, "RSEM preparation specific options") + + group.add_option("--tags", dest="tags", type="string", + default="UG,BX", + help="Comma-seperated list of tags to transfer from read1 to read2") + group.add_option("--sam", dest="sam", action="store_true", + default=False, + help="input and output SAM rather than BAM") + + parser.add_option_group(group) + + # add common options (-h/--help, ...) and parse command line + (options, args) = U.Start(parser, argv=argv, + add_group_dedup_options=False, + add_umi_grouping_options=False, + add_sam_options=False) + + if options.stdin != sys.stdin: + in_name = options.stdin.name + options.stdin.close() + else: + in_name = "-" + + if options.sam: + mode = "" + else: + mode = "b" + + inbam = pysam.AlignmentFile(in_name, "r"+mode) + + if options.stdout != sys.stdout: + out_name = options.stdout.name + options.stdout.close() + else: + out_name = "-" + + outbam = pysam.AlignmentFile(out_name, "w" + mode, template = inbam) + + options.tags = options.tags.split(",") + + for template in chunk_bam(inbam): + + current_template = {True: defaultdict(list), + False: defaultdict(list)} + + for read in template: + key = (read.reference_name, read.pos, not read.is_secondary) + current_template[read.is_read1][key].append(read) + + output = set() + + for read in template: + + mate = None + + # if this read is a non_primary alignment, we first want to check if it has a mate + # with the non-primary alignment flag set. + mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) + if read.is_secondary: + + # get a list of secondary reads at the correct alignment position + potential_secondary_mates = current_template[not read.is_read1][mate_key_secondary] + + # search through one at a time to find a read that points to the current read + # as its mate. + for candidate_mate in potential_secondary_mates: + if candidate_mate.next_reference_name == read.reference_name and \ + candidate_mate.next_reference_start == read.pos: + mate = candidate_mate + + # if no such read is found, then pick any old secondary alignment at that position + # note: this happens when UMI-tools outputs the wrong read as somethings pair. + if mate is None and len(potential_secondary_mates) >0: + mate = potential_secondary_mates[0] + + # The following happens if read is primary, or is secondary, but doesn't point to a secondary + # alignment as its mate (or at least we didn't find one). If which case find the primary + # alignment at that location. + if mate is None: + mate_key_primary = (read.next_reference_name, read.next_reference_start, True) + + try: + # there should be exactly one primary alignment at this location + mate = current_template[not read.is_read1][mate_key_primary][0] + except KeyError: + U.warn("Alignment {} has no mate -- skipped".format( + "\t".join([read.query_name, read.flag, read.reference_name, read.pos]) + )) + continue + + # because we might want to make changes to the read, but not have those changes reflected + # if we need the read again,we copy the read. This is only way I can find to do this. + read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header) + mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header) + + # Make it so that if our read is secondary, the mate is also secondary. We don't make the + # mate primary if the read is primary because we would otherwise end up with mulitple + # primary alignments. + if read.is_secondary: + mate.is_secondary = True + + # In a situation where there is already one mate for each read, then we will come across + # each pair twice - once when we scan read1 and once when we scan read2. Thus we need + # to make sure we don't output something already output. + if read.is_read1: + + mate = copy_tags(options.tags, read, mate) + output_key = str(read) + str(mate) + + if output_key not in output: + output.add(output_key) + outbam.write(read) + outbam.write(mate) + + elif read.is_read2: + + read = copy_tags(options.tags, mate, read) + output_key = str(mate) + str(read) + + if output_key not in output: + output.add(output_key) + outbam.write(mate) + outbam.write(key) + + else: + U.warn("Alignment {} is neither read1 nor read2 -- skipped".format( + "\t".join([read.query_name, read.flag, read.reference_name, read.pos]) + )) + continue + + if not out_name == "-": + outbam.close() + U.Stop() + +if __name__ == "__main__": + sys.exit(main(sys.argv)) + + + From f4f93743608b100856d57c973400bf1736dc5760 Mon Sep 17 00:00:00 2001 From: IanSudbery Date: Sun, 28 Mar 2021 19:41:47 +0100 Subject: [PATCH 2/6] more sensible naming --- umi_tools/{prepare_for_rsem.py => prepare-for-rsem.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename umi_tools/{prepare_for_rsem.py => prepare-for-rsem.py} (100%) diff --git a/umi_tools/prepare_for_rsem.py b/umi_tools/prepare-for-rsem.py similarity index 100% rename from umi_tools/prepare_for_rsem.py rename to umi_tools/prepare-for-rsem.py From b88e923d82d05f6d422f86c05389b9e522fd581c Mon Sep 17 00:00:00 2001 From: Ian Sudbery Date: Sat, 4 Dec 2021 17:01:37 +0000 Subject: [PATCH 3/6] Fix typo in output code for prepare-for-rsem --- umi_tools/prepare-for-rsem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py index 1cccc93..ab5d413 100644 --- a/umi_tools/prepare-for-rsem.py +++ b/umi_tools/prepare-for-rsem.py @@ -200,7 +200,7 @@ def main(argv=None): if output_key not in output: output.add(output_key) outbam.write(mate) - outbam.write(key) + outbam.write(read) else: U.warn("Alignment {} is neither read1 nor read2 -- skipped".format( From e3f348e21f482d289da5f73cc42b779d7f69876b Mon Sep 17 00:00:00 2001 From: Ian Sudbery <5391758+IanSudbery@users.noreply.github.com> Date: Tue, 14 Dec 2021 10:34:37 +0000 Subject: [PATCH 4/6] Fix exception type in prepare-for-RSEM prepare-for-rsem tries to identify cases where a read has no mate by catching a look up error. However, the wrong exception type was specified. Since current template is a dictionary of defaultdicts, it shouldn't throw a key error. However the returned list will be empty, so any attempt to index into it will throw an IndexError. --- umi_tools/prepare-for-rsem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py index ab5d413..62ca956 100644 --- a/umi_tools/prepare-for-rsem.py +++ b/umi_tools/prepare-for-rsem.py @@ -162,7 +162,7 @@ def main(argv=None): try: # there should be exactly one primary alignment at this location mate = current_template[not read.is_read1][mate_key_primary][0] - except KeyError: + except (KeyError, IndexError): U.warn("Alignment {} has no mate -- skipped".format( "\t".join([read.query_name, read.flag, read.reference_name, read.pos]) )) From fe52a3c3e34851b9e024529e216662a1a4912f73 Mon Sep 17 00:00:00 2001 From: IanSudbery Date: Tue, 14 Dec 2021 11:59:55 +0000 Subject: [PATCH 5/6] prepare-for-rsem - edits to fix logging code on failing to find mates --- umi_tools/prepare-for-rsem.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py index 62ca956..4f7be75 100644 --- a/umi_tools/prepare-for-rsem.py +++ b/umi_tools/prepare-for-rsem.py @@ -24,7 +24,7 @@ from umi_tools import Utilities as U -from collections import defaultdict +from collections import defaultdict, Counter import pysam import sys @@ -95,6 +95,8 @@ def main(argv=None): add_umi_grouping_options=False, add_sam_options=False) + skipped_stats = Counter() + if options.stdin != sys.stdin: in_name = options.stdin.name options.stdin.close() @@ -163,8 +165,9 @@ def main(argv=None): # there should be exactly one primary alignment at this location mate = current_template[not read.is_read1][mate_key_primary][0] except (KeyError, IndexError): + skipped_stats["no_mate"] += 1 U.warn("Alignment {} has no mate -- skipped".format( - "\t".join([read.query_name, read.flag, read.reference_name, read.pos]) + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) )) continue @@ -191,6 +194,7 @@ def main(argv=None): output.add(output_key) outbam.write(read) outbam.write(mate) + skipped_stats["pairs_output"] += 1 elif read.is_read2: @@ -201,15 +205,23 @@ def main(argv=None): output.add(output_key) outbam.write(mate) outbam.write(read) + skipped_stats["pairs_output"] += 1 else: + skipped_stats["skipped_not_read12"] += 1 U.warn("Alignment {} is neither read1 nor read2 -- skipped".format( - "\t".join([read.query_name, read.flag, read.reference_name, read.pos]) + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) )) continue if not out_name == "-": outbam.close() + + U.info("Total pairs output: {}, Pairs skipped - no mates: {}," + " Pairs skipped - not read1 or 2: {}".format( + skipped_stats["pairs_output"], + skipped_stats["no_mate"], + skipped_stats["skipped_not_read12"])) U.Stop() if __name__ == "__main__": From bf8608d6a172c5ca0dcf33c126b4e23429177a72 Mon Sep 17 00:00:00 2001 From: Ian Sudbery Date: Tue, 28 Dec 2021 22:57:41 +0000 Subject: [PATCH 6/6] Refactored prepare-for-rsem so that reads are chunked properly, and will find secondary mates of primary reads --- umi_tools/prepare-for-rsem.py | 77 +++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/umi_tools/prepare-for-rsem.py b/umi_tools/prepare-for-rsem.py index 4f7be75..b7a8bea 100644 --- a/umi_tools/prepare-for-rsem.py +++ b/umi_tools/prepare-for-rsem.py @@ -50,6 +50,7 @@ def chunk_bam(bamfile): yield(output_buffer) output_buffer = list() + last_query_name = read.query_name output_buffer.append(read) yield (output_buffer) @@ -68,6 +69,29 @@ def copy_tags(tags, read1, read2): return(read2) +def pick_mate(read, template_dict, mate_key): + '''Find the mate of read in the template dict using key. It will retreieve + all reads at that key, and then scan to pick the one that refers to _read_ + as it's mate. If there is no such read, it picks a first one it comes to''' + + mate = None + + # get a list of secondary reads at the correct alignment position + potential_mates = template_dict[not read.is_read1][mate_key] + + # search through one at a time to find a read that points to the current read + # as its mate. + for candidate_mate in potential_mates: + if candidate_mate.next_reference_name == read.reference_name and \ + candidate_mate.next_reference_start == read.pos: + mate = candidate_mate + + # if no such read is found, then pick any old secondary alignment at that position + # note: this happens when UMI-tools outputs the wrong read as something's pair. + if mate is None and len(potential_mates) >0: + mate = potential_mates[0] + + return mate def main(argv=None): @@ -122,6 +146,7 @@ def main(argv=None): for template in chunk_bam(inbam): + assert len(set(r.query_name for r in template)) == 1 current_template = {True: defaultdict(list), False: defaultdict(list)} @@ -137,39 +162,31 @@ def main(argv=None): # if this read is a non_primary alignment, we first want to check if it has a mate # with the non-primary alignment flag set. - mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) - if read.is_secondary: - # get a list of secondary reads at the correct alignment position - potential_secondary_mates = current_template[not read.is_read1][mate_key_secondary] + mate_key_primary = ( True) + mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) + + # First look for a read that has the same primary/secondary status + # as read (i.e. secondary mate for secondary read, and primary mate + # for primary read) + mate_key = (read.next_reference_name, read.next_reference_start, + read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If none was found then look for the opposite (primary mate of secondary + # read or seconadary mate of primary read) + if mate is None: + mate_key = (read.next_reference_name, read.next_reference_start, + not read.is_secondary) + mate = pick_mate(read, current_template, mate_key) - # search through one at a time to find a read that points to the current read - # as its mate. - for candidate_mate in potential_secondary_mates: - if candidate_mate.next_reference_name == read.reference_name and \ - candidate_mate.next_reference_start == read.pos: - mate = candidate_mate - - # if no such read is found, then pick any old secondary alignment at that position - # note: this happens when UMI-tools outputs the wrong read as somethings pair. - if mate is None and len(potential_secondary_mates) >0: - mate = potential_secondary_mates[0] - - # The following happens if read is primary, or is secondary, but doesn't point to a secondary - # alignment as its mate (or at least we didn't find one). If which case find the primary - # alignment at that location. + # If we still don't have a mate, then their can't be one? if mate is None: - mate_key_primary = (read.next_reference_name, read.next_reference_start, True) - - try: - # there should be exactly one primary alignment at this location - mate = current_template[not read.is_read1][mate_key_primary][0] - except (KeyError, IndexError): - skipped_stats["no_mate"] += 1 - U.warn("Alignment {} has no mate -- skipped".format( - "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) - )) - continue + skipped_stats["no_mate"] += 1 + U.warn("Alignment {} has no mate -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + )) + continue # because we might want to make changes to the read, but not have those changes reflected # if we need the read again,we copy the read. This is only way I can find to do this.