Skip to content

Commit

Permalink
Split alignments when the reading frame changes, following #479.
Browse files Browse the repository at this point in the history
Also avoid double reporting in nuc.csv.
  • Loading branch information
donkirkby committed Jul 13, 2021
1 parent 3863d1a commit a5851b7
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 49 deletions.
1 change: 1 addition & 0 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -1439,6 +1439,7 @@ def aln2counts(aligned_csv,
if g2p_aligned_csv is not None:
report.nuc_detail_writer = report.amino_detail_writer = None
report.combined_reports.clear()
report.combined_report_nucleotides.clear()
if report.remap_conseqs is not None:
report.remap_conseqs[G2P_SEED_NAME] = projects.getReference(
G2P_SEED_NAME)
Expand Down
93 changes: 87 additions & 6 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def test_start_contig_big_deletion(projects):


def test_start_contig_insert_and_big_deletion(projects):
""" Split alignments around big deletions. """
""" Split alignments around a big deletion after an insert. """
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = (seed_seq[789:900] +
Expand Down Expand Up @@ -230,6 +230,87 @@ def test_start_contig_insert_and_big_deletion(projects):
assert_alignments(aligner, *expected_alignments)


def test_start_contig_frame_change_insert(projects):
""" Split alignments when the reading frame changes. """
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = (seed_seq[800:900] +
'C' +
seed_seq[900:1000])
expected_alignments = [AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=800,
r_en=900,
q_st=0,
q_en=100,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1),
AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=900,
r_en=1000,
q_st=101,
q_en=201,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1)]
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, *expected_alignments)


def test_start_contig_frame_change_delete(projects):
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = (seed_seq[800:899] + seed_seq[900:1000])
expected_alignments = [AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=800,
r_en=899,
q_st=0,
q_en=99,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1),
AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=900,
r_en=1000,
q_st=99,
q_en=199,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1)]
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, *expected_alignments)


def test_start_contig_close_frame_changes(projects):
""" The reading frame changes and then changes back within 30 bases. """
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = (seed_seq[800:899] + seed_seq[900:920] + seed_seq[922:1000])
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert len(aligner.alignments) == 1


def test_start_contig_short_consensus(projects):
""" Consensus is too short for minimap2, fall back to Gotoh. """
seed_name = 'SARS-CoV-2-seed'
Expand Down Expand Up @@ -323,18 +404,18 @@ def test_start_contig_matched_deletion_gotoh(projects):
def test_start_contig_insertion_minimap2(projects):
seed_name = 'SARS-CoV-2-seed'
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[2000:2030] + 'T' + seed_seq[2030:2060]
consensus = seed_seq[2000:2030] + 'ACT' + seed_seq[2030:2060]
expected_alignment = AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=2000,
r_en=2060,
q_st=0,
q_en=61,
mapq=9,
q_en=63,
mapq=7,
cigar=[[30, CigarActions.MATCH],
[1, CigarActions.INSERT],
[3, CigarActions.INSERT],
[30, CigarActions.MATCH]],
NM=1)
NM=3)
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)
Expand Down
133 changes: 90 additions & 43 deletions micall/utils/consensus_aligner.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import typing
from dataclasses import dataclass
from enum import IntEnum
from operator import attrgetter

Expand All @@ -8,6 +9,9 @@
from micall.core.project_config import ProjectConfig
from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide

# A section between reading frame shifts must be at least this long to be split.
MINIMUM_READING_FRAME_SHIFT = 30

CigarActions = IntEnum(
'CigarActions',
'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED',
Expand Down Expand Up @@ -235,7 +239,7 @@ def start_contig(self,
for alignment in self.alignments
if alignment.is_primary]
self.alignments.sort(key=attrgetter('q_st'))
self.adjust_alignments()
self.split_alignments()

def align_gotoh(self, coordinate_seq, consensus):
gap_open_penalty = 15
Expand Down Expand Up @@ -279,10 +283,10 @@ def align_gotoh(self, coordinate_seq, consensus):
q_en=consensus_index,
cigar=cigar))

def adjust_alignments(self):
""" Remove big deletions and overlaps between alignments. """
def split_alignments(self):
""" Split alignments into sections that can be translated to aminos. """

# Overlaps
# Remove overlaps between alignments.
max_query_pos = -1
for alignment_num, alignment in enumerate(self.alignments):
query_start = alignment.q_st
Expand All @@ -301,56 +305,88 @@ def adjust_alignments(self):
self.alignments[alignment_num] = new_alignment
max_query_pos = alignment.q_en

# Big deletions
# Split alignments around frame shifts and big deletions.
new_alignments = []
for alignment in self.alignments:
new_cigar = []
query_start = query_end = alignment.q_st
ref_start = ref_end = alignment.r_st
for size, action in alignment.cigar:
breakpoints = [AlignmentBreakpoint(alignment.q_st,
alignment.q_st,
alignment.r_st,
alignment.r_st,
0,
0)]
query_end = alignment.q_st
ref_end = alignment.r_st
old_reading_frame = (alignment.r_st - alignment.q_st) % 3
for cigar_index, (size, action) in enumerate(alignment.cigar):
if action == CigarActions.MATCH:
new_cigar.append([size, action])
query_end += size
ref_end += size
elif action == CigarActions.INSERT:
new_cigar.append([size, action])
query_end += size
new_reading_frame = (ref_end - query_end) % 3
if old_reading_frame != new_reading_frame:
breakpoints.append(AlignmentBreakpoint(query_end-size,
query_end,
ref_end,
ref_end,
cigar_index,
cigar_index+1))
old_reading_frame = new_reading_frame
else:
assert action == CigarActions.DELETE
if size <= 6:
new_cigar.append([size, action])
ref_end += size
else:
new_alignments.append(AlignmentWrapper.wrap(
alignment,
q_st=query_start,
q_en=query_end,
r_st=ref_start,
r_en=ref_end,
cigar=new_cigar,
# Unneeded fields => -1.
mlen=-1,
blen=-1,
NM=-1))
query_start = query_end
ref_end += size
ref_start = ref_end
new_cigar = []
if len(new_cigar) == len(alignment.cigar):
# No change, use original alignment.
new_reading_frame = (ref_end + size - query_end)
if new_reading_frame != old_reading_frame or 6 < size:
breakpoints.append(AlignmentBreakpoint(query_end,
query_end,
ref_end,
ref_end+size,
cigar_index,
cigar_index+1))
old_reading_frame = new_reading_frame
ref_end += size
breakpoints.append(AlignmentBreakpoint(alignment.q_en,
alignment.q_en,
alignment.r_en,
alignment.r_en,
len(alignment.cigar),
len(alignment.cigar)))
filtered_breakpoints = [breakpoints[0]]
prev_breakpoints = iter(breakpoints)
mid_breakpoints = iter(breakpoints)
next_breakpoints = iter(breakpoints)
next(mid_breakpoints)
next(next_breakpoints)
next(next_breakpoints)
for prev_breakpoint, mid_breakpoint, next_breakpoint in zip(
prev_breakpoints, mid_breakpoints, next_breakpoints):
prev_size = (mid_breakpoint.from_query_pos -
prev_breakpoint.to_query_pos)
next_size = (next_breakpoint.from_query_pos -
mid_breakpoint.to_query_pos)
if MINIMUM_READING_FRAME_SHIFT <= min(prev_size, next_size):
filtered_breakpoints.append(mid_breakpoint)
filtered_breakpoints.append(breakpoints[-1])
if len(filtered_breakpoints) == 2:
new_alignments.append(alignment)
else:
new_alignments.append(AlignmentWrapper.wrap(
alignment,
q_st=query_start,
q_en=query_end,
r_st=ref_start,
r_en=ref_end,
cigar=new_cigar,
# Unneeded fields => -1.
mlen=-1,
blen=-1,
NM=-1))
prev_breakpoints = iter(breakpoints)
next_breakpoints = iter(breakpoints)
next(next_breakpoints)
for prev_breakpoint, next_breakpoint in zip(
prev_breakpoints, next_breakpoints):
new_alignment = AlignmentWrapper.wrap(
alignment,
q_st=prev_breakpoint.to_query_pos,
q_en=next_breakpoint.from_query_pos,
r_st=prev_breakpoint.to_ref_pos,
r_en=next_breakpoint.from_ref_pos,
cigar=alignment.cigar[prev_breakpoint.to_cigar:
next_breakpoint.from_cigar],
# Unneeded fields => -1.
mlen=-1,
blen=-1,
NM=-1)
new_alignments.append(new_alignment)
self.alignments = new_alignments

def clear(self):
Expand Down Expand Up @@ -615,3 +651,14 @@ def build_nucleotide_report(self,
ref_nuc_index += 1
consensus_nuc_index += 1
section_nuc_index += 1


@dataclass
class AlignmentBreakpoint:
""" Describe the pieces to remove from an alignment. """
from_query_pos: int
to_query_pos: int
from_ref_pos: int
to_ref_pos: int
from_cigar: int
to_cigar: int

0 comments on commit a5851b7

Please sign in to comment.