Skip to content

Commit

Permalink
Merge pull request #877 from nextstrain/feat/cli-replace-unknown
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov authored Jun 23, 2022
2 parents d437b49 + 7ffc959 commit 286f7f3
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 33 deletions.
10 changes: 10 additions & 0 deletions packages_rs/nextclade-cli/src/cli/nextalign_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,16 @@ pub struct NextalignRunOutputArgs {
/// Note: the sequences which trigger errors during processing will be omitted from outputs, regardless of this flag.
#[clap(long)]
pub in_order: bool,

/// Replace unknown nucleotide characters with 'N'
///
/// By default, the sequences containing unknown nucleotide characters are skipped with a warning - they
/// are not aligned and not included in the results. If this flag is provided, then before the alignment,
/// all unknown characters are replaced with 'N'. This replacement allows these sequences to be aligned.
///
/// The following characters are considered known: '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'
#[clap(long)]
pub replace_unknown: bool,
}

#[derive(Parser, Debug)]
Expand Down
33 changes: 19 additions & 14 deletions packages_rs/nextclade-cli/src/cli/nextalign_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use nextclade::align::params::AlignPairwiseParams;
use nextclade::io::fasta::{read_one_fasta, FastaReader, FastaRecord};
use nextclade::io::gene_map::{filter_gene_map, GeneMap};
use nextclade::io::gff3::read_gff3_file;
use nextclade::io::nuc::to_nuc_seq;
use nextclade::io::nuc::{to_nuc_seq, to_nuc_seq_replacing};
use nextclade::run::nextalign_run_one::nextalign_run_one;
use nextclade::translate::translate_genes_ref::translate_genes_ref;
use nextclade::types::outputs::NextalignOutputs;
Expand Down Expand Up @@ -43,6 +43,7 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> {
output_insertions,
output_errors,
include_reference,
replace_unknown,
in_order,
..
},
Expand Down Expand Up @@ -106,19 +107,23 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> {
for FastaRecord { seq_name, seq, index } in &fasta_receiver {
info!("Processing sequence '{seq_name}'");

let outputs_or_err = to_nuc_seq(&seq)
.wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
.and_then(|qry_seq| {
nextalign_run_one(
&qry_seq,
ref_seq,
ref_peptides,
gene_map,
gap_open_close_nuc,
gap_open_close_aa,
alignment_params,
)
});
let outputs_or_err = if replace_unknown {
Ok(to_nuc_seq_replacing(&seq))
} else {
to_nuc_seq(&seq)
}
.wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
.and_then(|qry_seq| {
nextalign_run_one(
&qry_seq,
ref_seq,
ref_peptides,
gene_map,
gap_open_close_nuc,
gap_open_close_aa,
alignment_params,
)
});

let record = NextalignRecord {
index,
Expand Down
10 changes: 10 additions & 0 deletions packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,16 @@ pub struct NextcladeRunOutputArgs {
/// Note: the sequences which trigger errors during processing will be omitted from outputs, regardless of this flag.
#[clap(long)]
pub in_order: bool,

/// Replace unknown nucleotide characters with 'N'
///
/// By default, the sequences containing unknown nucleotide characters are skipped with a warning - they
/// are not analyzed and not included in the results. If this flag is provided, then before the alignment,
/// all unknown characters are replaced with 'N'. This replacement allows these sequences to be analyzed.
///
/// The following characters are considered known: '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'
#[clap(long)]
pub replace_unknown: bool,
}

#[derive(Parser, Debug, Clone)]
Expand Down
43 changes: 24 additions & 19 deletions packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use nextclade::align::params::AlignPairwiseParams;
use nextclade::io::fasta::{FastaReader, FastaRecord};
use nextclade::io::fs::has_extension;
use nextclade::io::json::json_write;
use nextclade::io::nuc::{to_nuc_seq, Nuc};
use nextclade::io::nuc::{to_nuc_seq, to_nuc_seq_replacing, Nuc};
use nextclade::make_error;
use nextclade::run::nextclade_run_one::nextclade_run_one;
use nextclade::translate::translate_genes::Translation;
Expand Down Expand Up @@ -89,6 +89,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
output_errors,
include_reference,
in_order,
replace_unknown,
..
},
other: NextcladeRunOtherArgs { jobs },
Expand Down Expand Up @@ -169,24 +170,28 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
for FastaRecord { seq_name, seq, index } in &fasta_receiver {
info!("Processing sequence '{seq_name}'");

let outputs_or_err = to_nuc_seq(&seq)
.wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
.and_then(|qry_seq| {
nextclade_run_one(
&seq_name,
&qry_seq,
ref_seq,
ref_peptides,
gene_map,
primers,
tree,
qc_config,
virus_properties,
gap_open_close_nuc,
gap_open_close_aa,
alignment_params,
)
});
let outputs_or_err = if replace_unknown {
Ok(to_nuc_seq_replacing(&seq))
} else {
to_nuc_seq(&seq)
}
.wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
.and_then(|qry_seq| {
nextclade_run_one(
&seq_name,
&qry_seq,
ref_seq,
ref_peptides,
gene_map,
primers,
tree,
qc_config,
virus_properties,
gap_open_close_nuc,
gap_open_close_aa,
alignment_params,
)
});

let record = NextcladeRecord {
index,
Expand Down
5 changes: 5 additions & 0 deletions packages_rs/nextclade/src/io/nuc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ pub fn to_nuc_seq(str: &str) -> Result<Vec<Nuc>, Report> {
str.chars().map(to_nuc).collect()
}

/// Converts a string into a sequence of `Nuc`s, one per character, without ever failing.
///
/// Each character is passed through `to_nuc`. Whenever `to_nuc` returns an error for a character,
/// that character is substituted with `Nuc::N` and the error is discarded. The output therefore
/// always contains exactly one `Nuc` per input character, in the original order.
pub fn to_nuc_seq_replacing(str: &str) -> Vec<Nuc> {
  let mut nucs = Vec::new();
  for c in str.chars() {
    let nuc = match to_nuc(c) {
      Ok(known) => known,
      // Unrecognized character: fall back to `N` instead of propagating the error
      Err(_) => Nuc::N,
    };
    nucs.push(nuc);
  }
  nucs
}

pub fn from_nuc_seq(seq: &[Nuc]) -> String {
seq.iter().map(|nuc| from_nuc(*nuc)).collect()
}

0 comments on commit 286f7f3

Please sign in to comment.