Skip to content

Commit

Permalink
Don't count g2p mapped reads twice, for #442.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Dec 5, 2019
1 parent a29d4a2 commit 67b2b49
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 11 deletions.
12 changes: 10 additions & 2 deletions micall/core/cascade_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ def parse_args():


class CascadeReport:
def __init__(self, cascade_csv):
def __init__(self, cascade_csv, is_g2p_remapped=False):
""" Initialize a report object.
:param cascade_csv: an open CSV file to write to
:param is_g2p_remapped: True if the G2P reads get mapped again in the
remap step, so they shouldn't be included in the demultiplexed count
"""
self.g2p_summary_csv = self.remap_counts_csv = self.aligned_csv = None
self.counts = None
self.cascade_csv = cascade_csv
self.is_g2p_remapped = is_g2p_remapped

def generate(self):
field_names = ['demultiplexed',
Expand Down Expand Up @@ -57,7 +64,8 @@ def read_g2p(self):
reader = DictReader(self.g2p_summary_csv)
row = next(reader)
mapped = int(row['mapped'])
self.counts['demultiplexed'] += mapped
if not self.is_g2p_remapped:
self.counts['demultiplexed'] += mapped
self.counts['v3loop'] = mapped
self.counts['g2p'] += int(row['valid'])

Expand Down
20 changes: 11 additions & 9 deletions micall/drivers/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,16 @@ def process(self,
summary_file=read_summary,
use_gzip=use_gzip)

logger.info('Running merge_for_entropy on %s.', self)
with open(self.read_entropy_csv, 'w') as read_entropy_csv:
merge_for_entropy(self.trimmed1_fastq,
self.trimmed2_fastq,
read_entropy_csv,
scratch_path)
if use_denovo:
logger.info('Running merge_for_entropy on %s.', self)
with open(self.read_entropy_csv, 'w') as read_entropy_csv:
merge_for_entropy(self.trimmed1_fastq,
self.trimmed2_fastq,
read_entropy_csv,
scratch_path)

write_merge_lengths_plot(self.read_entropy_csv,
self.merge_lengths_svg)
write_merge_lengths_plot(self.read_entropy_csv,
self.merge_lengths_svg)

logger.info('Running fastq_g2p on %s.', self)
with open(self.trimmed1_fastq) as fastq1, \
Expand Down Expand Up @@ -247,7 +248,8 @@ def process(self,
open(self.remap_counts_csv) as remap_counts_csv, \
open(self.aligned_csv) as aligned_csv, \
open(self.cascade_csv, 'w') as cascade_csv:
cascade_report = CascadeReport(cascade_csv)
cascade_report = CascadeReport(cascade_csv,
is_g2p_remapped=use_denovo)
cascade_report.g2p_summary_csv = g2p_summary_csv
cascade_report.remap_counts_csv = remap_counts_csv
cascade_report.aligned_csv = aligned_csv
Expand Down
24 changes: 24 additions & 0 deletions micall/tests/test_cascade_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,30 @@ def test_remap_inputs(self):

self.assertEqual(expected_report, report.cascade_csv.getvalue())

def test_g2p_inputs_remapped(self):
report = CascadeReport(StringIO(), is_g2p_remapped=True)
report.g2p_summary_csv = StringIO("""\
mapped,valid,X4calls,X4pct,final,validpct
100,90,0,0.00,,90.00
""")
report.remap_counts_csv = StringIO("""\
type,count,ignored
raw,300,x
prelim R1-seed,200,99
prelim *,100,
remap-1 R1-seed,220,
remap-final R1-seed,240,
unmapped,60,
""")
expected_report = """\
demultiplexed,v3loop,g2p,prelim_map,remap,aligned
150,100,90,100,120,0
"""

report.generate()

self.assertEqual(expected_report, report.cascade_csv.getvalue())

def test_remap_inputs_two_seeds(self):
report = CascadeReport(StringIO())
report.remap_counts_csv = StringIO("""\
Expand Down

0 comments on commit 67b2b49

Please sign in to comment.