Merge pull request #877 from nextstrain/feat/cli-replace-unknown

nextstrain · Jun 23, 2022 · 286f7f3 · 286f7f3
2 parents d437b49 + 7ffc959
commit 286f7f3
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 33 deletions.
diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs b/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs
@@ -249,6 +249,16 @@ pub struct NextalignRunOutputArgs {
   /// Note: the sequences which trigger errors during processing will be omitted from outputs, regardless of this flag.
   #[clap(long)]
   pub in_order: bool,
+
+  /// Replace unknown nucleotide characters with 'N'
+  ///
+  /// By default, sequences containing unknown nucleotide characters are skipped with a warning - they
+  /// are not aligned and not included in the results. If this flag is provided, then before the alignment,
+  /// all unknown characters are replaced with 'N'. This replacement allows these sequences to be aligned.
+  ///
+  /// The following characters are considered known:  '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'
+  #[clap(long)]
+  pub replace_unknown: bool,
 }
 
 #[derive(Parser, Debug)]

diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs b/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs
@@ -10,7 +10,7 @@ use nextclade::align::params::AlignPairwiseParams;
 use nextclade::io::fasta::{read_one_fasta, FastaReader, FastaRecord};
 use nextclade::io::gene_map::{filter_gene_map, GeneMap};
 use nextclade::io::gff3::read_gff3_file;
-use nextclade::io::nuc::to_nuc_seq;
+use nextclade::io::nuc::{to_nuc_seq, to_nuc_seq_replacing};
 use nextclade::run::nextalign_run_one::nextalign_run_one;
 use nextclade::translate::translate_genes_ref::translate_genes_ref;
 use nextclade::types::outputs::NextalignOutputs;
@@ -43,6 +43,7 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> {
         output_insertions,
         output_errors,
         include_reference,
+        replace_unknown,
         in_order,
         ..
       },
@@ -106,19 +107,23 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> {
         for FastaRecord { seq_name, seq, index } in &fasta_receiver {
           info!("Processing sequence '{seq_name}'");
 
-          let outputs_or_err = to_nuc_seq(&seq)
-            .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
-            .and_then(|qry_seq| {
-              nextalign_run_one(
-                &qry_seq,
-                ref_seq,
-                ref_peptides,
-                gene_map,
-                gap_open_close_nuc,
-                gap_open_close_aa,
-                alignment_params,
-              )
-            });
+          let outputs_or_err = if replace_unknown {
+            Ok(to_nuc_seq_replacing(&seq))
+          } else {
+            to_nuc_seq(&seq)
+          }
+          .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
+          .and_then(|qry_seq| {
+            nextalign_run_one(
+              &qry_seq,
+              ref_seq,
+              ref_peptides,
+              gene_map,
+              gap_open_close_nuc,
+              gap_open_close_aa,
+              alignment_params,
+            )
+          });
 
           let record = NextalignRecord {
             index,

diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
@@ -519,6 +519,16 @@ pub struct NextcladeRunOutputArgs {
   /// Note: the sequences which trigger errors during processing will be omitted from outputs, regardless of this flag.
   #[clap(long)]
   pub in_order: bool,
+
+  /// Replace unknown nucleotide characters with 'N'
+  ///
+  /// By default, sequences containing unknown nucleotide characters are skipped with a warning - they
+  /// are not analyzed and not included in the results. If this flag is provided, then before the alignment,
+  /// all unknown characters are replaced with 'N'. This replacement allows these sequences to be analyzed.
+  ///
+  /// The following characters are considered known:  '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'
+  #[clap(long)]
+  pub replace_unknown: bool,
 }
 
 #[derive(Parser, Debug, Clone)]

diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
@@ -14,7 +14,7 @@ use nextclade::align::params::AlignPairwiseParams;
 use nextclade::io::fasta::{FastaReader, FastaRecord};
 use nextclade::io::fs::has_extension;
 use nextclade::io::json::json_write;
-use nextclade::io::nuc::{to_nuc_seq, Nuc};
+use nextclade::io::nuc::{to_nuc_seq, to_nuc_seq_replacing, Nuc};
 use nextclade::make_error;
 use nextclade::run::nextclade_run_one::nextclade_run_one;
 use nextclade::translate::translate_genes::Translation;
@@ -89,6 +89,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
         output_errors,
         include_reference,
         in_order,
+        replace_unknown,
         ..
       },
     other: NextcladeRunOtherArgs { jobs },
@@ -169,24 +170,28 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
         for FastaRecord { seq_name, seq, index } in &fasta_receiver {
           info!("Processing sequence '{seq_name}'");
 
-          let outputs_or_err = to_nuc_seq(&seq)
-            .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
-            .and_then(|qry_seq| {
-              nextclade_run_one(
-                &seq_name,
-                &qry_seq,
-                ref_seq,
-                ref_peptides,
-                gene_map,
-                primers,
-                tree,
-                qc_config,
-                virus_properties,
-                gap_open_close_nuc,
-                gap_open_close_aa,
-                alignment_params,
-              )
-            });
+          let outputs_or_err = if replace_unknown {
+            Ok(to_nuc_seq_replacing(&seq))
+          } else {
+            to_nuc_seq(&seq)
+          }
+          .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
+          .and_then(|qry_seq| {
+            nextclade_run_one(
+              &seq_name,
+              &qry_seq,
+              ref_seq,
+              ref_peptides,
+              gene_map,
+              primers,
+              tree,
+              qc_config,
+              virus_properties,
+              gap_open_close_nuc,
+              gap_open_close_aa,
+              alignment_params,
+            )
+          });
 
           let record = NextcladeRecord {
             index,

diff --git a/packages_rs/nextclade/src/io/nuc.rs b/packages_rs/nextclade/src/io/nuc.rs
@@ -146,6 +146,11 @@ pub fn to_nuc_seq(str: &str) -> Result<Vec<Nuc>, Report> {
   str.chars().map(to_nuc).collect()
 }
 
+/// Converts string characters to `Nuc`s, replacing unknown characters with `N`
+pub fn to_nuc_seq_replacing(str: &str) -> Vec<Nuc> {
+  str.chars().map(|c| to_nuc(c).unwrap_or(Nuc::N)).collect()
+}
+
 pub fn from_nuc_seq(seq: &[Nuc]) -> String {
   seq.iter().map(|nuc| from_nuc(*nuc)).collect()
 }