Also report position of 2nd read in "group --group-out" table #180

Status: Open · wants to merge 15 commits into master
3 changes: 3 additions & 0 deletions tests/count_paired_wide_contig.tsv
@@ -0,0 +1,3 @@
gene count
AAC73112 14
AAC73113 50
133 changes: 133 additions & 0 deletions tests/dedup_paired_contig.sam

Large diffs are not rendered by default.

133 changes: 133 additions & 0 deletions tests/dedup_paired_contig_py3.sam

Large diffs are not rendered by default.

6,859 changes: 6,859 additions & 0 deletions tests/group_paired_contig.sam

Large diffs are not rendered by default.

3,371 changes: 3,371 additions & 0 deletions tests/group_paired_contig.tsv

Large diffs are not rendered by default.

6,859 changes: 6,859 additions & 0 deletions tests/group_paired_contig_py3.sam

Large diffs are not rendered by default.

3,371 changes: 3,371 additions & 0 deletions tests/group_paired_contig_py3.tsv

Large diffs are not rendered by default.

Binary file added tests/paired_contig.bam
Binary file not shown.
Binary file added tests/paired_contig.bam.bai
Binary file not shown.
20 changes: 20 additions & 0 deletions tests/tests.yaml
@@ -140,6 +140,12 @@ count_tab_single_per_cell:
references: [count_tab_cell.tsv]
options: count_tab -L test.log --per-cell

count_paired_wide_contig:
stdin: paired_contig.bam
outputs: [stdout]
references: [count_paired_wide_contig.tsv]
options: count -L test.log --random-seed=123456789 --method=directional --umi-separator="|" --paired --per-contig --wide-format-cell-counts

dedup_single_ignore:
stdin: chr19.bam
outputs: [stdout]
@@ -225,12 +231,26 @@ dedup_single_gene_tag:
references: [single_gene_tag_py3.sam]
options: dedup -L test.log --out-sam --random-seed=123456789 --method=directional --per-gene --gene-tag=XF --skip-tags-regex="^[__|Unassigned]"

dedup_paired_contig:
sort: True
stdin: paired_contig.bam
outputs: [stdout]
references: [dedup_paired_contig_py3.sam]
options: dedup -L test.log --out-sam --random-seed=123456789 --method=directional --umi-separator="|" --paired

group_gene_tag:
stdin: chr19_gene_tags.bam
outputs: [group_dir_per_gene_py3.tsv, stdout]
references: [group_dir_per_gene_py3.tsv, group_dir_per_gene_py3.sam]
options: group -L test.log --random-seed=123456789 --method=directional --per-gene --gene-tag=XF --skip-tags-regex="^[__|Unassigned]" --group-out=group_dir_per_gene_py3.tsv --output-bam --out-sam

group_paired_contig:
sort: True
stdin: paired_contig.bam
outputs: [stdout, group_paired_contig_py3.tsv]
references: [group_paired_contig_py3.sam, group_paired_contig_py3.tsv]
options: group -L test.log --random-seed=123456789 --umi-separator="|" --paired --output-bam --out-sam --group-out=group_paired_contig_py3.tsv

group_unique:
stdin: chr19.bam
outputs: [stdout, group_uniq_py3.tsv]
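The three new test entries above exercise the paired code paths end to end: paired_contig.bam is piped on stdin and the captured output is compared against the checked-in reference files. The sort: True flag on the dedup and group tests suggests an order-insensitive comparison, since the order in which completed pairs are flushed may differ from the reference. A rough sketch of such a comparison, with a hypothetical helper name (the repo's actual test runner may differ):

# Hedged sketch only: an order-insensitive file comparison of the kind
# "sort: True" appears to request. Function name and paths are illustrative.
def outputs_match(observed_path, reference_path, sort=False):
    with open(observed_path) as obs, open(reference_path) as ref:
        obs_lines, ref_lines = obs.readlines(), ref.readlines()
    if sort:
        obs_lines.sort()
        ref_lines.sort()
    return obs_lines == ref_lines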
9 changes: 4 additions & 5 deletions umi_tools/dedup.py
@@ -199,9 +199,6 @@ def main(argv=None):
infile = pysam.Samfile(in_name, in_mode)
outfile = pysam.Samfile(out_name, out_mode, template=infile)

if options.paired:
outfile = sam_methods.TwoPassPairWriter(infile, outfile)

nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

if options.detection_method:
@@ -292,8 +289,10 @@ def main(argv=None):
bundle=bundle,
threshold=options.threshold)

for read in reads:
for read, read2 in reads:
outfile.write(read)
if read2 is not None:
outfile.write(read2)
nOutput += 1

if options.stats:
@@ -304,7 +303,7 @@
[bundle[UMI]['count'] for UMI in bundle])

# collect post-dedup stats
post_cluster_umis = [bundle_iterator.barcode_getter(x)[0] for x in reads]
post_cluster_umis = [bundle_iterator.barcode_getter(r1)[0] for r1, r2 in reads]
stats_post_df_dict['UMI'].extend(umis)
stats_post_df_dict['counts'].extend(umi_counts)
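With this change the dedup loop consumes (read1, read2) tuples rather than bare reads, which is why the TwoPassPairWriter wrapper above is removed: both mates are written directly. A condensed, self-contained sketch of the new write path (function name illustrative, not part of the diff):

def write_dedup_output(reads, outfile):
    # Each retained entry is a (read1, read2) tuple; read2 is None for
    # unpaired reads, so one or two alignments are emitted per entry.
    n_output = 0
    for read, read2 in reads:
        outfile.write(read)
        if read2 is not None:
            outfile.write(read2)
        n_output += 1
    return n_output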

28 changes: 20 additions & 8 deletions umi_tools/group.py
@@ -79,7 +79,7 @@
import os

# required to make iteritems and zip python2 and python3 compatible
from builtins import dict
from builtins import dict, zip
from future.utils import iteritems

import pysam
@@ -187,8 +187,10 @@ def main(argv=None):
if options.tsv:
mapping_outfile = U.openFile(options.tsv, "w")
mapping_outfile.write("%s\n" % "\t".join(
["read_id", "contig", "position", "gene", "umi", "umi_count",
"final_umi", "final_umi_count", "unique_id"]))
["read_id", "contig", "position"] +
(["position2"] if options.paired else []) +
["gene", "umi", "umi_count", "final_umi", "final_umi_count",
"unique_id"]))

nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

@@ -262,28 +264,38 @@ def main(argv=None):

for umi in umi_group:
reads = bundle[umi]['read']
for read in reads:
reads2 = bundle[umi]['read2']
for read, read2 in zip(reads, reads2):
if outfile:
# Add the 'UG' tag to the read
read.set_tag('UG', unique_id)
read.set_tag(options.umi_group_tag, top_umi)
outfile.write(read)
if read2 is not None:
read2.set_tag('UG', unique_id)
read2.set_tag(options.umi_group_tag, top_umi)
outfile.write(read2)

if options.tsv:
if options.per_gene:
# XXX: What if the two reads disagree?
gene = read.get_tag(gene_tag)
else:
gene = "NA"
mapping_outfile.write("%s\n" % "\t".join(map(str, (
pos = sam_methods.get_read_position(
read, options.soft_clip_threshold)[1]
pos2 = sam_methods.get_read_position(
read2, options.soft_clip_threshold)[1] if read2 is not None else ""
mapping_outfile.write("%s\n" % "\t".join(map(str, [
read.query_name, read.reference_name,
sam_methods.get_read_position(
read, options.soft_clip_threshold)[1],
pos] +
([pos2] if options.paired else []) + [
gene,
umi.decode(),
counts[umi],
top_umi.decode(),
group_count,
unique_id))))
unique_id])))

nOutput += 1

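The net effect on the --group-out table: a position2 column is inserted after position in --paired mode, while single-end output keeps its original nine columns. A minimal sketch of the resulting header logic (helper name illustrative):

def group_out_header(paired):
    # "position2" is only present for --paired runs.
    cols = ["read_id", "contig", "position"]
    if paired:
        cols.append("position2")
    cols += ["gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]
    return "\t".join(cols)

assert "position2" not in group_out_header(False)  # 9 columns, as before
assert "position2" in group_out_header(True)       # 10 columns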
2 changes: 1 addition & 1 deletion umi_tools/network.py
@@ -405,7 +405,7 @@ def __call__(self, bundle, threshold):
final_umis = [cluster[0] for cluster in clusters]
umi_counts = [sum(counts[umi] for umi in cluster)
for cluster in clusters]
reads = [bundle[umi]["read"] for umi in final_umis]
reads = [(bundle[umi]["read"], bundle[umi]["read2"]) for umi in final_umis]

return (reads, final_umis, umi_counts)

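The clusterer now pairs each UMI's representative read1 with the matching "read2" slot that get_bundles maintains (see sam_methods.py below). A toy illustration of that layout, with plain strings standing in for pysam reads:

# Toy data only; real values are pysam AlignedSegment objects (or None).
bundle = {
    b"ACGT": {"read": "r1_a", "read2": "r2_a", "count": 3},
    b"ACGG": {"read": "r1_b", "read2": None, "count": 1},  # unpaired
}
final_umis = [b"ACGT", b"ACGG"]
reads = [(bundle[umi]["read"], bundle[umi]["read2"]) for umi in final_umis]
assert reads == [("r1_a", "r2_a"), ("r1_b", None)]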
148 changes: 133 additions & 15 deletions umi_tools/sam_methods.py
@@ -126,6 +126,99 @@ def get_barcode_umis(read, cell_barcode=False):
"read tag")


class get_readpairs:
# Used in place of a real read in incomplete pairs before they are completed
PendingMate = object()

# Pair key. To allow for multi-mapping, the key also includes read2's
# mapping position (its own start, or read1's mate start).
def get_pair_key(self, read):
return (read.qname, read.reference_start if read.is_read2 else read.next_reference_start)

def drain_queue_forcibly(self):
# Forcibly convert incomplete queued pairs into singletons.
# These necessarily contain only read1, not read2, since we
# only queue a pair once we have observed read1.
while self.queue:
pair = self.queue.popleft()
if pair[1] is get_readpairs.PendingMate:
pair[1] = None
U.warn("inconsistent BAM file: only read1 of proper pair %s found on chromosome %s, treating as unpaired" %
(str(self.get_pair_key(pair[0])), pair[0].reference_name))
yield pair
# Also forcibly convert incomplete unqueued pairs into singletons.
# These are the ones that contain only read2 but no read1.
for pair in self.incomplete_pairs.values():
if pair[0] is get_readpairs.PendingMate:
pair[0] = None
U.warn("inconsistent BAM file: only read2 of proper pair %s found on chromosome %s, treating as unpaired" %
(str(self.get_pair_key(pair[1])), pair[1].reference_name))
yield pair
self.incomplete_pairs.clear()

def __call__(self, inreads):
self.queue = collections.deque()
self.incomplete_pairs = dict()
self.current_chr = None

for read in inreads:
read_chr = read.reference_name
# Read index. 0 for read1 and 1 for read2
read_i = int(read.is_read2)
pair_key = self.get_pair_key(read)

# Output leftover incomplete read pairs if the chromosome changes
if self.current_chr is not None and self.current_chr != read_chr:
for queue_entry in self.drain_queue_forcibly():
yield queue_entry
self.current_chr = read_chr

# Queue read, either as a singleton, or as part of a read pair.
# Proper pairs should always have both mates mapped to the same chromosome,
# but to avoid weird behaviour on broken BAM files, we re-check.
if ((not read.is_proper_pair) or
read.is_unmapped or
read.mate_is_unmapped or
(read.reference_id != read.next_reference_id)):
# Read is not part of a proper pair. Enqueue as singleton
pair = [None, None]
pair[read_i] = read
self.queue.append(pair)
else:
if pair_key in self.incomplete_pairs:
# Read is part of an already queued (incomplete) read pair. Complete it.
pair = self.incomplete_pairs.pop(pair_key)
if pair[read_i] is not get_readpairs.PendingMate:
U.warn("inconsistent BAM file: both mates of %s flagged to be read%d" %
(str(pair_key), read_i+1))
read_i = 1 - read_i
pair[read_i] = read
else:
# Read is part of a new read pair. Create incomplete pair
pair = [get_readpairs.PendingMate, get_readpairs.PendingMate]
pair[read_i] = read
self.incomplete_pairs[pair_key] = pair
# Enqueue pair in the correct place for its 1st read
if read_i == 0:
self.queue.append(pair)

# Output queued reads and readpairs up to the first incomplete pair
while self.queue and sum(1 for r in self.queue[0] if r is get_readpairs.PendingMate) == 0:
yield self.queue.popleft()

# Output leftover incomplete read pairs at the end of the input
for queue_entry in self.drain_queue_forcibly():
yield queue_entry
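# Illustrative usage sketch (not part of the diff): get_readpairs streams
# (read1, read2) tuples from position-sorted input, holding each proper
# pair until its mate arrives; either slot is None for singletons. The
# BAM path is the test file added by this PR.
def _demo_get_readpairs():
    import pysam
    with pysam.AlignmentFile("tests/paired_contig.bam", "rb") as bam:
        for read1, read2 in get_readpairs()(bam.fetch(until_eof=True)):
            assert read1 is not None or read2 is not None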


class get_singletons:
def __call__(self, inreads):
for read in inreads:
if read.is_read2:
yield None, read
else:
yield read, None
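# Quick sanity sketch (illustrative only): get_singletons mirrors the
# (read1, read2) shape of get_readpairs for single-end input, so the
# bundler below can use one iteration contract for both modes.
def _demo_get_singletons():
    class _Stub(object):  # minimal stand-in, not a pysam read
        def __init__(self, is_read2):
            self.is_read2 = is_read2
    pairs = list(get_singletons()([_Stub(False), _Stub(True)]))
    assert pairs[0][1] is None and pairs[1][0] is None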


class get_bundles:

''' A functor - When called returns a dictionary of dictionaries,
@@ -208,7 +301,7 @@ def __init__(self,
self.read_counts = collections.defaultdict(
lambda: collections.defaultdict(dict))

def update_dicts(self, read, pos, key, umi):
def update_dicts(self, read, read2, pos, key, umi):

# The content of the reads_dict depends on whether all reads
# are being retained
Expand All @@ -219,9 +312,11 @@ def update_dicts(self, read, pos, key, umi):
self.reads_dict[pos][key][umi]["count"] += 1
except KeyError:
self.reads_dict[pos][key][umi]["read"] = [read]
self.reads_dict[pos][key][umi]["read2"] = [read2]
self.reads_dict[pos][key][umi]["count"] = 1
else:
self.reads_dict[pos][key][umi]["read"].append(read)
self.reads_dict[pos][key][umi]["read2"].append(read2)

elif self.only_count_reads:
# retain all reads per key
@@ -236,18 +331,25 @@
self.reads_dict[pos][key][umi]["count"] += 1
except KeyError:
self.reads_dict[pos][key][umi]["read"] = read
self.reads_dict[pos][key][umi]["read2"] = read2
self.reads_dict[pos][key][umi]["count"] = 1
self.read_counts[pos][key][umi] = 0
else:
if self.reads_dict[pos][key][umi]["read"].mapq > read.mapq:
dict_r1 = self.reads_dict[pos][key][umi]["read"]
dict_r2 = self.reads_dict[pos][key][umi]["read2"]
dict_mapq = dict_r1.mapq + (dict_r2.mapq if dict_r2 is not None else 0)
read_mapq = read.mapq + (read2.mapq if read2 is not None else 0)
if dict_mapq > read_mapq:
return

if self.reads_dict[pos][key][umi]["read"].mapq < read.mapq:
if read_mapq > dict_mapq:
self.reads_dict[pos][key][umi]["read"] = read
self.reads_dict[pos][key][umi]["read2"] = read2
self.read_counts[pos][key][umi] = 0
return

# TS: implemented different checks for multimapping here
# XXX: What to do in paired mode? Probably just complain...
if self.options.detection_method in ["NH", "X0"]:
tag = self.options.detection_method
if (self.reads_dict[pos][key][umi]["read"].opt(tag) <
@@ -270,6 +372,7 @@ def update_dicts(self, read, pos, key, umi):

if random.random() < prob:
self.reads_dict[pos][key][umi]["read"] = read
self.reads_dict[pos][key][umi]["read2"] = read2

def check_output(self):

@@ -311,14 +414,19 @@ def check_output(self):
return do_output, out_keys

def __call__(self, inreads):
if self.options.paired:
generator = get_readpairs()
else:
generator = get_singletons()

for read in inreads:
for read, read2 in generator(inreads):

if read.is_read2:
# Unpaired second reads are returned if the return_read2 option is set
if read is None:
if self.return_read2:
if not read.is_unmapped or (
read.is_unmapped and self.return_unmapped):
yield read, None, "single_read"
if not read2.is_unmapped or (
read2.is_unmapped and self.return_unmapped):
yield read2, None, "single_read"
continue
else:
self.read_events['Input Reads'] += 1
@@ -356,13 +464,11 @@

# if read1 is unmapped, yield immediately or skip read
if self.return_unmapped:
self.read_events['Input Reads'] += 1
yield read, None, "single_read"
continue

if self.options.paired and read.mate_is_unmapped:
if not read.is_unmapped:
self.read_events['Read 2 unmapped'] += 1
self.read_events['Read 2 unmapped'] += 1

# if paired end input and read2 is unmapped, skip unless
# options.unmapped_reads == "use", in which case TLEN will be 0
@@ -396,7 +502,8 @@ def __call__(self, inreads):
continue

if self.options.mapping_quality:
if read.mapq < self.options.mapping_quality:
if ((read.mapq < self.options.mapping_quality) or
(read2 is not None and read2.mapq < self.options.mapping_quality)):
self.read_events['< MAPQ threshold'] += 1
continue

@@ -500,12 +607,23 @@ def __call__(self, inreads):
else:
r_length = 0

key = (read.is_reverse, self.options.spliced and is_spliced,
self.options.paired*read.tlen, r_length)
key = (read.is_reverse, self.options.spliced and is_spliced, r_length)

# In paired mode, we add the 2nd read's properties as well
if self.options.paired and (read2 is not None):
start2, pos2, is_spliced2 = get_read_position(
read2, self.options.soft_clip_threshold)

if self.options.read_length:
r_length2 = read2.query_length
else:
r_length2 = 0

key = (key, read2.is_reverse, self.options.spliced and is_spliced2, r_length2)

# update dictionaries
key = (key, cell)
self.update_dicts(read, pos, key, umi)
self.update_dicts(read, read2, pos, key, umi)

if self.metacontig_contig:
# keep track of observed contigs for each gene
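Finally, note how the dedup key changes shape in paired mode: the old self.options.paired*read.tlen component is replaced by nesting read1's key inside a tuple that also carries read2's strand, splice state, and length. A toy sketch of the two shapes, with plain values standing in for the real read attributes:

def make_key(is_reverse, spliced, r_length, mate=None):
    # mate is a dict of read2 attributes in paired mode, else None;
    # all values here are toy stand-ins, not pysam fields.
    key = (is_reverse, spliced, r_length)
    if mate is not None:
        key = (key, mate["is_reverse"], mate["spliced"], mate["r_length"])
    return key

single = make_key(False, False, 75)
paired = make_key(False, False, 75,
                  mate={"is_reverse": True, "spliced": False, "r_length": 75})
assert paired[0] == single and single != paired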