Skip to content

Commit

Permalink
Retry amino alignment without marking bad codons, for #442.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Nov 26, 2019
1 parent c7a95f4 commit 27003c2
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 24 deletions.
68 changes: 44 additions & 24 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,29 +246,13 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):
:return: Dict object containing coordinate map.
"""

# Start max_score with the minimum score we can consider a valid
# alignment. Anything worse, we won't bother
consensus_length = len([amino for amino in self.seed_aminos[0] if amino.counts])
max_score = min(consensus_length, len(coordinate_ref))

best_alignment = None
for reading_frame, frame_seed_aminos in self.seed_aminos.items():
consensus = ''.join([seed_amino1.get_consensus()
for seed_amino1 in frame_seed_aminos])
if reading_frame == 0:
# best guess before aligning - if alignments fail, this will be non-empty
self.consensus[coordinate_name] = consensus

# map to reference coordinates by aligning consensus
a_coord, a_consensus, score = self._pair_align(
best_alignment = self.find_coordinate_alignment(coordinate_name,
coordinate_ref)
if best_alignment is None:
best_alignment = self.find_coordinate_alignment(
coordinate_name,
coordinate_ref,
consensus,
gap_open=GAP_OPEN_COORD,
gap_extend=GAP_EXTEND_COORD)
if score < max_score:
continue
max_score = score
best_alignment = (reading_frame, consensus, a_coord, a_consensus)
mark_bad_codons=False)

report_aminos = []
if best_alignment is not None:
Expand Down Expand Up @@ -326,6 +310,42 @@ def _map_to_coordinate_ref(self, coordinate_name, coordinate_ref):

self.reports[coordinate_name] = report_aminos

def find_coordinate_alignment(self,
coordinate_name,
coordinate_ref,
mark_bad_codons=True):
""" Align the amino counts in self.seed_aminos to a reference.
:param str coordinate_name: the name of the coordinate reference to
align to
:param str coordinate_ref: the amino acid sequence to align to
:param bool mark_bad_codons: True if codons with no valid reads but some
invalid reads should be marked with question marks.
"""
# Start max_score with the minimum score we can consider a valid
# alignment. Anything worse, we won't bother
consensus_length = len([amino for amino in self.seed_aminos[0] if amino.counts])
max_score = min(consensus_length, len(coordinate_ref))
best_alignment = None
for reading_frame, frame_seed_aminos in self.seed_aminos.items():
consensus = ''.join([seed_amino1.get_consensus(mark_bad_codons)
for seed_amino1 in frame_seed_aminos])
if reading_frame == 0:
# best guess before aligning - if alignments fail, this will be non-empty
self.consensus[coordinate_name] = consensus

# map to reference coordinates by aligning consensus
a_coord, a_consensus, score = self._pair_align(
coordinate_ref,
consensus,
gap_open=GAP_OPEN_COORD,
gap_extend=GAP_EXTEND_COORD)
if score < max_score:
continue
max_score = score
best_alignment = (reading_frame, consensus, a_coord, a_consensus)
return best_alignment

def map_sequences(self, from_seq, to_seq, from_aligned=None, to_aligned=None):
if from_aligned is None or to_aligned is None:
from_aligned, to_aligned, _score = self._pair_align(
Expand Down Expand Up @@ -1072,7 +1092,7 @@ def get_report(self):
return ','.join([str(self.counts[amino])
for amino in AMINO_ALPHABET])

def get_consensus(self):
def get_consensus(self, mark_bad_codons=True):
""" Find the amino acid that was seen most often in count_aminos().
If there is a tie, just pick one of the tied amino acids.
Expand All @@ -1081,7 +1101,7 @@ def get_consensus(self):
consensus = self.counts.most_common(1)
if consensus:
return consensus[0][0]
if self.read_count:
if self.read_count and mark_bad_codons:
return '?'
return '-'

Expand Down
25 changes: 25 additions & 0 deletions micall/tests/test_aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,31 @@ def testPartialStartCodonNucleotideReport(self):

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testPartialCodonsInMiddleOfAminoReport(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
R3-seed,15,0,9,0,AAATTTCA-AC-CC-CG-GA-CA-
""")

expected_text = """\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,v3_overlap,coverage
R3-seed,R3,15,1,1,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
R3-seed,R3,15,4,2,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
R3-seed,R3,15,7,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0
R3-seed,R3,15,10,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0
R3-seed,R3,15,13,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0
R3-seed,R3,15,16,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0
R3-seed,R3,15,19,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0
R3-seed,R3,15,22,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0
"""

self.report.read(aligned_reads)
self.report.write_amino_header(self.report_file)
self.report.write_amino_counts()

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())

def testLowQualityNucleotideReport(self):
# refname,qcut,rank,count,offset,seq
aligned_reads = self.prepareReads("""\
Expand Down

0 comments on commit 27003c2

Please sign in to comment.