From a2f174dfd7e1f0fb89277269b26083b914bae569 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Wed, 22 Jun 2022 11:21:12 +0200
Subject: [PATCH] feat(cli): allow to replace unknown nucs with Ns

This adds a flag `--replace-unknown` to `run` command, which allows to replace unknown nucleotide characters with 'N'.

By default, the sequences containing unknown nucleotide nucleotide characters are skipped with a warning - they
are not aligned and not included into results. If this flag is provided, then before the alignment,
all unknown characters are replaced with 'N'. This replacement allows to align and analyze these sequences.

The following characters are considered known:

```
-ABCDGHKMNRSTVWY
```
---
 .../nextclade-cli/src/cli/nextalign_cli.rs    | 10 +++++
 .../nextclade-cli/src/cli/nextalign_loop.rs   | 33 ++++++++------
 .../nextclade-cli/src/cli/nextclade_cli.rs    | 10 +++++
 .../nextclade-cli/src/cli/nextclade_loop.rs   | 43 +++++++++++--------
 packages_rs/nextclade/src/io/nuc.rs           |  5 +++
 5 files changed, 68 insertions(+), 33 deletions(-)

diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs b/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs
index 56e5cf606..f7938dbcd 100644
--- a/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs
+++ b/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs
@@ -237,6 +237,16 @@ pub struct NextalignRunArgs {
   #[clap(long)]
   pub in_order: bool,
 
+  /// Replace unknown nucleotide characters with 'N'
+  ///
+  /// By default, the sequences containing unknown nucleotide nucleotide characters are skipped with a warning - they
+  /// are not aligned and not included into results. If this flag is provided, then before the alignment,
+  /// all unknown characters are replaced with 'N'. This replacement allows to align these sequences.
+  ///
+  /// The following characters are considered known:  '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'
+  #[clap(long)]
+  pub replace_unknown: bool,
+
   /// Number of processing jobs. If not specified, all available CPU threads will be used.
   #[clap(global = false, long, short = 'j', default_value_t = num_cpus::get() )]
   pub jobs: usize,
diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs b/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs
index b8abe4a23..80ab4b416 100644
--- a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs
+++ b/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs
@@ -8,7 +8,7 @@ use nextclade::align::params::AlignPairwiseParams;
 use nextclade::io::fasta::{read_one_fasta, FastaReader, FastaRecord};
 use nextclade::io::gene_map::{filter_gene_map, GeneMap};
 use nextclade::io::gff3::read_gff3_file;
-use nextclade::io::nuc::to_nuc_seq;
+use nextclade::io::nuc::{to_nuc_seq, to_nuc_seq_replacing};
 use nextclade::run::nextalign_run_one::nextalign_run_one;
 use nextclade::translate::translate_genes_ref::translate_genes_ref;
 use nextclade::types::outputs::NextalignOutputs;
@@ -37,6 +37,7 @@ pub fn nextalign_run(args: NextalignRunArgs) -> Result<(), Report> {
     output_errors,
     jobs,
     in_order,
+    replace_unknown,
     alignment_params: alignment_params_from_cli,
     ..
   } = args;
@@ -97,19 +98,23 @@ pub fn nextalign_run(args: NextalignRunArgs) -> Result<(), Report> {
         for FastaRecord { seq_name, seq, index } in &fasta_receiver {
           info!("Processing sequence '{seq_name}'");
 
-          let outputs_or_err = to_nuc_seq(&seq)
-            .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
-            .and_then(|qry_seq| {
-              nextalign_run_one(
-                &qry_seq,
-                ref_seq,
-                ref_peptides,
-                gene_map,
-                gap_open_close_nuc,
-                gap_open_close_aa,
-                alignment_params,
-              )
-            });
+          let outputs_or_err = if replace_unknown {
+            Ok(to_nuc_seq_replacing(&seq))
+          } else {
+            to_nuc_seq(&seq)
+          }
+          .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
+          .and_then(|qry_seq| {
+            nextalign_run_one(
+              &qry_seq,
+              ref_seq,
+              ref_peptides,
+              gene_map,
+              gap_open_close_nuc,
+              gap_open_close_aa,
+              alignment_params,
+            )
+          });
 
           let record = NextalignRecord {
             index,
diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
index 19b1aa8f1..9ba156c1a 100644
--- a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
+++ b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
@@ -493,6 +493,16 @@ pub struct NextcladeRunArgs {
   #[clap(long)]
   pub in_order: bool,
 
+  /// Replace unknown nucleotide characters with 'N'
+  ///
+  /// By default, the sequences containing unknown nucleotide nucleotide characters are skipped with a warning - they
+  /// are not analyzed and not included into results. If this flag is provided, then before the alignment,
+  /// all unknown characters are replaced with 'N'. This replacement allows to analyze these sequences.
+  ///
+  /// The following characters are considered known:  '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'
+  #[clap(long)]
+  pub replace_unknown: bool,
+
   /// Number of processing jobs. If not specified, all available CPU threads will be used.
   #[clap(global = false, long, short = 'j', default_value_t = num_cpus::get() )]
   pub jobs: usize,
diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
index 178d8ed4d..46bee0c65 100644
--- a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
+++ b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
@@ -12,7 +12,7 @@ use nextclade::align::params::AlignPairwiseParams;
 use nextclade::io::fasta::{FastaReader, FastaRecord};
 use nextclade::io::fs::has_extension;
 use nextclade::io::json::json_write;
-use nextclade::io::nuc::{to_nuc_seq, Nuc};
+use nextclade::io::nuc::{to_nuc_seq, to_nuc_seq_replacing, Nuc};
 use nextclade::make_error;
 use nextclade::run::nextclade_run_one::nextclade_run_one;
 use nextclade::translate::translate_genes::Translation;
@@ -74,6 +74,7 @@ pub fn nextclade_run(args: NextcladeRunArgs) -> Result<(), Report> {
     output_errors,
     jobs,
     in_order,
+    replace_unknown,
     genes,
     ..
   } = args.clone();
@@ -152,24 +153,28 @@ pub fn nextclade_run(args: NextcladeRunArgs) -> Result<(), Report> {
         for FastaRecord { seq_name, seq, index } in &fasta_receiver {
           info!("Processing sequence '{seq_name}'");
 
-          let outputs_or_err = to_nuc_seq(&seq)
-            .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
-            .and_then(|qry_seq| {
-              nextclade_run_one(
-                &seq_name,
-                &qry_seq,
-                ref_seq,
-                ref_peptides,
-                gene_map,
-                primers,
-                tree,
-                qc_config,
-                virus_properties,
-                gap_open_close_nuc,
-                gap_open_close_aa,
-                alignment_params,
-              )
-            });
+          let outputs_or_err = if replace_unknown {
+            Ok(to_nuc_seq_replacing(&seq))
+          } else {
+            to_nuc_seq(&seq)
+          }
+          .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'"))
+          .and_then(|qry_seq| {
+            nextclade_run_one(
+              &seq_name,
+              &qry_seq,
+              ref_seq,
+              ref_peptides,
+              gene_map,
+              primers,
+              tree,
+              qc_config,
+              virus_properties,
+              gap_open_close_nuc,
+              gap_open_close_aa,
+              alignment_params,
+            )
+          });
 
           let record = NextcladeRecord {
             index,
diff --git a/packages_rs/nextclade/src/io/nuc.rs b/packages_rs/nextclade/src/io/nuc.rs
index ee77e56a1..a7a93ef19 100644
--- a/packages_rs/nextclade/src/io/nuc.rs
+++ b/packages_rs/nextclade/src/io/nuc.rs
@@ -146,6 +146,11 @@ pub fn to_nuc_seq(str: &str) -> Result<Vec<Nuc>, Report> {
   str.chars().map(to_nuc).collect()
 }
 
+/// Converts string characters to `Nuc`s, replacing unknown characters with `N`
+pub fn to_nuc_seq_replacing(str: &str) -> Vec<Nuc> {
+  str.chars().map(|c| to_nuc(c).unwrap_or(Nuc::N)).collect()
+}
+
 pub fn from_nuc_seq(seq: &[Nuc]) -> String {
   seq.iter().map(|nuc| from_nuc(*nuc)).collect()
 }