Skip to content

Commit

Permalink
feat: retry with reverse complement when seed matching fails
Browse files Browse the repository at this point in the history
Adds flag `--retry-reverse-complement` which enables additional attempt of seed matching when initial attempt fails. The second attempt is performed on reverse-complemented sequence.

As a consequence, the output alignment, peptides and analysis results correspond to this modified sequence and not to the original.

This functionality is opt-in and the default behavior is to skip sequence with a warning.
  • Loading branch information
ivan-aksamentov committed Jun 23, 2022
1 parent 6435118 commit bc9839c
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 13 deletions.
31 changes: 20 additions & 11 deletions packages_rs/nextclade/src/align/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,24 @@ use crate::io::aa::Aa;
use crate::io::letter::Letter;
use crate::io::nuc::Nuc;
use crate::make_error;
use crate::translate::complement::reverse_complement_in_place;
use eyre::Report;
use log::trace;
use log::{info, trace};

fn align_pairwise<T: Letter<T>>(
qry_seq: &[T],
ref_seq: &[T],
gap_open_close: &[i32],
params: &AlignPairwiseParams,
stripes: &[Stripe],
) -> Result<AlignmentOutput<T>, Report> {
) -> AlignmentOutput<T> {
trace!("Align pairwise: started. Params: {params:?}");

let max_indel = params.max_indel;

let ScoreMatrixResult { scores, paths } = score_matrix(qry_seq, ref_seq, gap_open_close, stripes, params);

Ok(backtrace(qry_seq, ref_seq, &scores, &paths))
backtrace(qry_seq, ref_seq, &scores, &paths)
}

/// align nucleotide sequences via seed alignment and banded smith watermann without penalizing terminal gaps
Expand All @@ -42,13 +43,21 @@ pub fn align_nuc(
);
}

let stripes = seed_alignment(qry_seq, ref_seq, params)?;

let result = align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)?;

trace!("Score: {}", result.alignment_score);

Ok(result)
#[allow(clippy::map_err_ignore)]
match seed_alignment(qry_seq, ref_seq, params) {
Ok(stripes) => Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)),
Err(report) => {
if params.retry_reverse_complement {
info!("Seed matching failed. Retrying reverse complement"); // TODO: would be nice to have sequence index and name here
let mut qry_seq = qry_seq.to_owned();
reverse_complement_in_place(&mut qry_seq);
let stripes = seed_alignment(&qry_seq, ref_seq, params).map_err(|_| report)?;
Ok(align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes))
} else {
Err(report)
}
}
}
}

/// align amino acids using a fixed bandwidth banded alignment while penalizing terminal indels
Expand All @@ -59,7 +68,7 @@ pub fn align_aa(
params: &AlignPairwiseParams,
band_width: usize,
mean_shift: i32,
) -> Result<AlignmentOutput<Aa>, Report> {
) -> AlignmentOutput<Aa> {
let stripes = simple_stripes(mean_shift, band_width, ref_seq.len(), qry_seq.len());

align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)
Expand Down
8 changes: 7 additions & 1 deletion packages_rs/nextclade/src/align/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ pub enum GapAlignmentSide {
// as well as adds a method `.merge_opt(&opt)` to the original struct, which merges values from the optional counterpart
// into self (mutably).

#[allow(clippy::struct_excessive_bools)]
#[optfield(pub AlignPairwiseParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)]
#[derive(Parser, Debug, Clone, Serialize, Deserialize)]
pub struct AlignPairwiseParams {
Expand Down Expand Up @@ -70,8 +71,12 @@ pub struct AlignPairwiseParams {
#[clap(long)]
pub seed_spacing: i32,

/// Retry seed matching step with a reverse complement if the first attempt failed
#[clap(long, takes_value = false, forbid_empty_values = false, default_missing_value = "true")]
pub retry_reverse_complement: bool,

/// If this flag is present, the amino acid sequences will be truncated at the first stop codon, if mutations or sequencing errors cause premature stop codons to be present. No amino acid mutations in the truncated region will be recorded.
#[clap(long)]
#[clap(long, takes_value = false, forbid_empty_values = false, default_missing_value = "true")]
pub no_translate_past_stop: bool,

// Internal alignment parameter
Expand Down Expand Up @@ -112,6 +117,7 @@ impl Default for AlignPairwiseParams {
min_match_rate: 0.3,
seed_spacing: 100,
mismatches_allowed: 3,
retry_reverse_complement: false,
no_translate_past_stop: false,
left_terminal_gaps_free: true,
right_terminal_gaps_free: true,
Expand Down
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/translate/translate_genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ pub fn translate_gene(
&aa_params,
band_width,
mean_shift,
)?;
);

let mut stripped = insertions_strip(&alignment.qry_seq, &alignment.ref_seq);

Expand Down

0 comments on commit bc9839c

Please sign in to comment.