Skip to content

Commit

Permalink
Fix soft clipping counts, as part of #442.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Oct 22, 2018
1 parent bef6a84 commit c471be8
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 32 deletions.
83 changes: 58 additions & 25 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ def parse_args():
parser.add_argument('aligned_csv',
type=argparse.FileType('r'),
help='input CSV with aligned reads')
parser.add_argument('g2p_aligned_csv',
type=argparse.FileType('r'),
help='input CSV with aligned reads from G2P step')
parser.add_argument('clipping_csv',
type=argparse.FileType('r'),
help='input CSV with count of soft-clipped reads at each position')
Expand All @@ -56,6 +53,9 @@ def parse_args():
parser.add_argument('amino_csv',
type=argparse.FileType('w'),
help='CSV containing amino frequencies')
parser.add_argument('amino_detail_csv',
type=argparse.FileType('w'),
help='CSV containing amino frequencies for each contig')
parser.add_argument('coord_ins_csv',
type=argparse.FileType('w'),
help='CSV containing insertions relative to coordinate reference')
Expand Down Expand Up @@ -225,6 +225,7 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
for reading_frame, frame_seed_aminos in self.seed_aminos.items():
consensus = ''.join([seed_amino1.get_consensus()
for seed_amino1 in frame_seed_aminos])
consensus = consensus.replace('-', '?')
if reading_frame == 0:
# best guess before aligning - if alignments fail, this will be non-empty
self.consensus[coordinate_name] = consensus
Expand Down Expand Up @@ -484,6 +485,11 @@ def write_amino_report(self, amino_writer, reports, seed, coverage_summary=None)
query_pos = (str(seed_amino.consensus_nuc_index + 1)
if seed_amino.consensus_nuc_index is not None
else '')
max_clip_count = 0
total_insertion_count = 0
for seed_nuc in seed_amino.nucleotides:
max_clip_count = max(seed_nuc.clip_count, max_clip_count)
total_insertion_count += seed_nuc.insertion_count
row = {'seed': seed,
'region': region,
'q-cutoff': self.qcut,
Expand All @@ -492,8 +498,8 @@ def write_amino_report(self, amino_writer, reports, seed, coverage_summary=None)
'X': seed_amino.low_quality,
'partial': seed_amino.partial,
'del': seed_amino.deletions,
'ins': report_amino.insertion_count,
'clip': report_amino.max_clip_count,
'ins': total_insertion_count,
'clip': max_clip_count,
'v3_overlap': seed_amino.v3_overlap,
'coverage': seed_amino.deletions}
for letter in AMINO_ALPHABET:
Expand Down Expand Up @@ -543,48 +549,73 @@ def write_counts(self, region, seed_amino, report_amino, nuc_writer):
:param ReportAmino? report_amino: the ReportAmino object for this position
:param nuc_writer: the CSV writer to write the row into
"""
max_clip_count = 0
total_insertion_count = 0
seed_insertion_counts = self.conseq_insertion_counts[self.seed]
for i, seed_nuc in enumerate(seed_amino.nucleotides):
if seed_amino.consensus_nuc_index is None:
query_pos_txt = ''
insertion_count = clip_count = 0
else:
query_pos = i + seed_amino.consensus_nuc_index + 1
query_pos_txt = str(query_pos)
clip_count = self.clipping_counts[self.seed][query_pos]
max_clip_count = max(clip_count, max_clip_count)
insertion_count = seed_insertion_counts[query_pos]
ref_pos = (str(i + 3*report_amino.position - 2)
if report_amino is not None
else '')
if i == 2 and report_amino is not None:
insertion_counts = self.insert_writer.insert_pos_counts[
(self.seed, region)]
insertion_count += insertion_counts[report_amino.position]
total_insertion_count += insertion_count
row = {'seed': self.seed,
'region': region,
'q-cutoff': self.qcut,
'query.nuc.pos': query_pos_txt,
'refseq.nuc.pos': ref_pos,
'del': seed_nuc.counts['-'],
'ins': insertion_count,
'clip': clip_count,
'ins': seed_nuc.insertion_count,
'clip': seed_nuc.clip_count,
'v3_overlap': seed_nuc.v3_overlap,
'coverage': seed_nuc.get_coverage()}
for base in 'ACTGN':
nuc_count = seed_nuc.counts[base]
row[base] = nuc_count
nuc_writer.writerow(row)
if report_amino is not None:
report_amino.max_clip_count = max_clip_count
report_amino.insertion_count = total_insertion_count

def merge_extra_counts(self):
for region, report_aminos in self.reports.items():
first_amino_index = None
last_amino_index = None
last_consensus_nuc_index = None
for i, report_amino in enumerate(report_aminos):
seed_amino = report_amino.seed_amino
if seed_amino.consensus_nuc_index is not None:
if first_amino_index is None:
first_amino_index = i
last_amino_index = i
last_consensus_nuc_index = seed_amino.consensus_nuc_index
if first_amino_index is not None:
if self.amino_detail_writer is None:
seed_name = self.seed
else:
seed_name = self.detail_seed
seed_clipping = self.clipping_counts[seed_name]
seed_insertion_counts = self.conseq_insertion_counts[seed_name]
for i, report_amino in enumerate(report_aminos):
seed_amino = report_amino.seed_amino
if i < first_amino_index:
consensus_nuc_index = (i - first_amino_index) * 3
elif i > last_amino_index:
consensus_nuc_index = (i - last_amino_index) * 3 + \
last_consensus_nuc_index
else:
consensus_nuc_index = seed_amino.consensus_nuc_index
if consensus_nuc_index is None:
continue
for j, seed_nuc in enumerate(seed_amino.nucleotides):
query_pos = j + consensus_nuc_index + 1
seed_nuc.clip_count = seed_clipping[query_pos]
seed_nuc.insertion_count = seed_insertion_counts[query_pos]
if j == 2:
insertion_counts = self.insert_writer.insert_pos_counts[
(self.seed, region)]
seed_nuc.insertion_count += insertion_counts[report_amino.position]

def write_nuc_counts(self, nuc_writer=None):
nuc_writer = nuc_writer or self.nuc_writer

self.merge_extra_counts()
if not self.coordinate_refs:
for seed_amino in self.seed_aminos[0]:
self.write_counts(self.seed, seed_amino, None, nuc_writer)
Expand Down Expand Up @@ -882,7 +913,7 @@ class SeedNucleotide(object):
COUNTED_NUCS = 'ACTG-'

def __init__(self, counts=None):
self.v3_overlap = 0
self.v3_overlap = self.clip_count = self.insertion_count = 0
self.counts = counts or Counter()

def __repr__(self):
Expand All @@ -901,6 +932,8 @@ def count_nucleotides(self, nuc_seq, count):

def add(self, other):
self.counts += other.counts
self.clip_count += other.clip_count
self.insertion_count += other.insertion_count

def get_report(self):
""" Build a report string with the counts of each nucleotide.
Expand Down Expand Up @@ -1202,9 +1235,9 @@ def main():
args.failed_align_csv,
clipping_csv=args.clipping_csv,
conseq_ins_csv=args.conseq_ins_csv,
g2p_aligned_csv=args.g2p_aligned_csv,
remap_conseq_csv=args.remap_conseq_csv,
conseq_region_csv=args.conseq_region_csv)
conseq_region_csv=args.conseq_region_csv,
amino_detail_csv=args.amino_detail_csv)


if __name__ == '__main__':
Expand Down
20 changes: 15 additions & 5 deletions micall/core/denovo.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import argparse
import logging
import os
import sys
from csv import DictWriter, DictReader
from datetime import datetime
from glob import glob
from io import StringIO
from shutil import rmtree
Expand All @@ -20,7 +22,8 @@
'utils',
'hcv_geno',
'hcv.fasta')
ASSEMBLY_TIMEOUT=1800 # 30 minutes
ASSEMBLY_TIMEOUT = 1800 # 30 minutes
logger = logging.getLogger(__name__)


def write_genotypes(contigs_fasta_path, contigs_csv):
Expand All @@ -29,13 +32,14 @@ def write_genotypes(contigs_fasta_path, contigs_csv):
lineterminator=os.linesep)
writer.writeheader()
if contigs_fasta_path is None:
return
return 0
genotypes = genotype(contigs_fasta_path)
for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")):
genotype_name, match_fraction = genotypes.get(record.name, ('unknown', 0))
writer.writerow(dict(genotype=genotype_name,
match=match_fraction,
contig=record.seq))
return len(genotypes)


def genotype(fasta, db=DEFAULT_DATABASE):
Expand Down Expand Up @@ -81,10 +85,11 @@ def denovo(fastq1_path, fastq2_path, contigs, work_dir='.'):
rmtree(old_tmp_dir, ignore_errors=True)

tmp_dir = mkdtemp(dir=work_dir, prefix='iva_')
start_time = datetime.now()
is_successful = (0 == call(
[IVA, "-f", fastq1_path, "-r", fastq2_path, "-t", "1", "iva"],
cwd=tmp_dir,
timeout=ASSEMBLY_TIMEOUT))
cwd=tmp_dir))
duration = datetime.now() - start_time

if IS_SAVAGE_ENABLED:
pear_proc = Popen([PEAR, "-f", fastq1_path, "-r", fastq2_path, "-o", prefix],
Expand Down Expand Up @@ -114,7 +119,12 @@ def denovo(fastq1_path, fastq2_path, contigs, work_dir='.'):
contigs_fasta_path = os.path.join(tmp_dir, 'iva', 'contigs.fasta')
else:
contigs_fasta_path = None
write_genotypes(contigs_fasta_path, contigs)
contig_count = write_genotypes(contigs_fasta_path, contigs)
logger.info('Assembled %d contigs in %s (%ds) on %s.',
contig_count,
duration,
duration.total_seconds(),
fastq1_path)


if __name__ == '__main__':
Expand Down
76 changes: 76 additions & 0 deletions micall/tests/test_aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,56 @@ def testSoftClippingAminoReport(self):

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testMultiplePrefixSoftClippingAminoReport(self):
""" Combine the soft clipping data with the read counts.
"""
""" Assemble counts from three contigs to two references.
Contig 1-R1 AAATTT -> KF
Contig 2-R2 GGCCCG -> GP
Contig 3-R1 TTTAGG -> FR
Contig 1 and 3 should combine into R1 with KFR.
"""
# refname,qcut,rank,count,offset,seq
aligned_reads1 = self.prepareReads("1-R1-seed,15,0,5,0,AAATTT")
aligned_reads2 = self.prepareReads("2-R2-seed,15,0,4,0,GGCCCG")
aligned_reads3 = self.prepareReads("3-R1-seed,15,0,2,0,TTTAGG")

clipping = StringIO("""\
refname,pos,count
1-R1-seed,7,5
3-R1-seed,-1,2
""")

expected_text = """\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,v3_overlap,coverage
R1-seed,R1,15,,1,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,5
R1-seed,R1,15,,2,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7
R1-seed,R1,15,,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,5,0,2
R2-seed,R2,15,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R2-seed,R2,15,,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R2-seed,R2,15,,3,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
R2-seed,R2,15,,4,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
R2-seed,R2,15,,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""

self.report.read_clipping(clipping)
self.report.write_amino_header(self.report_file)
self.report.write_amino_detail_header(self.detail_report_file)
self.report.write_nuc_header(StringIO())
self.report.read(aligned_reads1)
self.report.write_nuc_counts()
self.report.write_amino_detail_counts()
self.report.read(aligned_reads2)
self.report.write_nuc_counts()
self.report.write_amino_detail_counts()
self.report.read(aligned_reads3)
self.report.write_nuc_counts()
self.report.write_amino_detail_counts()
self.report.write_amino_counts()

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testInsertionBetweenReadAndConsensusNucleotideReport(self):
""" Combine the soft clipping data with the read counts.
"""
Expand Down Expand Up @@ -1112,6 +1162,32 @@ def testPartialCodonNucleotideReport(self):

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testPartialStartCodonNucleotideReport(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R1-seed,15,0,9,0,TTAGG
""")

expected_text = """\
seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,\
A,C,G,T,N,del,ins,clip,v3_overlap,coverage
R1-seed,R1,15,,1,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,,2,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,,3,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,0,4,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,1,5,0,0,0,9,0,0,0,0,0,9
R1-seed,R1,15,2,6,0,0,0,9,0,0,0,0,0,9
R1-seed,R1,15,3,7,9,0,0,0,0,0,0,0,0,9
R1-seed,R1,15,4,8,0,0,9,0,0,0,0,0,0,9
R1-seed,R1,15,5,9,0,0,9,0,0,0,0,0,0,9
"""

self.report.read(aligned_reads)
self.report.write_nuc_header(self.report_file)
self.report.write_nuc_counts()

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testLowQualityNucleotideReport(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
Expand Down
9 changes: 9 additions & 0 deletions micall/tests/test_aln2counts_seed_nucleotide.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,19 @@ def test_add():
nuc = SeedNucleotide()
nuc.count_nucleotides('T', 4)
nuc.count_nucleotides('C', 1)
nuc.clip_count = 10
nuc.insertion_count = 7

other = SeedNucleotide()
other.count_nucleotides('C', 5)
other.clip_count = 9
other.insertion_count = 8
expected_counts = {'T': 4, 'C': 6}
expected_clip_count = 19
expected_insertion_count = 15

nuc.add(other)

assert expected_counts == nuc.counts
assert expected_clip_count == nuc.clip_count
assert expected_insertion_count == nuc.insertion_count
4 changes: 2 additions & 2 deletions micall/utils/contig_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def plot_contigs(sample_dir, contigs_csv):
plt.tight_layout()


def test_plot():
def live_plot():
# qaccver,saccver,pident,score,qcovhsp,qstart,qend,sstart,send
contigs_csv = """\
contig.00001,HCV-3a,9405,91.717,4747,75,2291,9389,1523,8621
Expand Down Expand Up @@ -160,4 +160,4 @@ def test_plot():
if __name__ == '__main__':
main()
elif __name__ == '__live_coding__':
test_plot()
live_plot()

0 comments on commit c471be8

Please sign in to comment.