diff --git a/crates/abd-clam/src/msa/quality/mod.rs b/crates/abd-clam/src/msa/quality/mod.rs
index 99519e4a2..c08c472b7 100644
--- a/crates/abd-clam/src/msa/quality/mod.rs
+++ b/crates/abd-clam/src/msa/quality/mod.rs
@@ -6,6 +6,31 @@
 use rayon::prelude::*;
 
 use crate::{utils, Dataset, FlatVec};
 
+// NOTE(review): the generic parameter lists on the two `impl FlatVec` blocks below
+// look stripped in this patch; confirm them against the `FlatVec` definition.
+impl FlatVec {
+    /// Removes every gap character (`'-'`) from every sequence in the MSA.
+    ///
+    /// Each sequence is filtered in place, so its existing allocation is
+    /// reused. Sequences may end up with different lengths afterwards.
+    #[must_use]
+    pub fn remove_gaps(mut self) -> Self {
+        self.instances.iter_mut().for_each(|s| s.retain(|c| c != '-'));
+        self
+    }
+}
+
+impl FlatVec {
+    /// Parallel version of `remove_gaps`.
+    ///
+    /// Sequences are filtered in place using rayon's parallel iterator.
+    #[must_use]
+    pub fn par_remove_gaps(mut self) -> Self {
+        self.instances.par_iter_mut().for_each(|s| s.retain(|c| c != '-'));
+        self
+    }
+}
+
 // TODO: Consider adding a new trait for MSA datasets. Then move these methods
 // to that trait.
diff --git a/crates/results/msa/src/main.rs b/crates/results/msa/src/main.rs
index bf20aaee0..33c6d602f 100644
--- a/crates/results/msa/src/main.rs
+++ b/crates/results/msa/src/main.rs
@@ -35,6 +35,10 @@ struct Args {
     #[arg(short('i'), long)]
     inp_path: PathBuf,
 
+    /// Whether the original fasta file was pre-aligned by the provider.
+    #[arg(short('p'), long)]
+    pre_aligned: bool,
+
     /// The number of samples to use for the dataset.
     #[arg(short('n'), long)]
     num_samples: Option,
@@ -103,7 +107,10 @@ fn main() -> Result<(), String> {
     ftlog::info!("Input file: {:?}", fasta_file.raw_path());
     ftlog::info!("Output directory: {:?}", fasta_file.out_dir());
 
-    let data = fasta_file.read(args.num_samples)?;
+    // NOTE(review): the turbofish argument on `read` looks stripped in this patch;
+    // confirm the intended type.
+    let data = fasta_file.read::(args.num_samples)?;
+    let data = if args.pre_aligned { data } else { data.par_remove_gaps() };
     ftlog::info!(
         "Finished reading original dataset: length range = {:?}",
         data.dimensionality_hint()