From bc4dcad8d55ae40b8ee2cb180a53238acf2d5f4c Mon Sep 17 00:00:00 2001 From: AroneyS Date: Tue, 4 Jun 2024 13:48:29 +1000 Subject: [PATCH] initial contig clustering attempt --- src/cluster_argument_parsing.rs | 24 +++ src/clusterer.rs | 1 + src/fastani.rs | 5 + src/lib.rs | 2 + src/skani.rs | 24 ++- tests/data/contigs/contigs.fna | 300 ++++++++++++++++++++++++++++++++ tests/test_cmdline.rs | 20 +++ 7 files changed, 372 insertions(+), 4 deletions(-) create mode 100644 tests/data/contigs/contigs.fna diff --git a/src/cluster_argument_parsing.rs b/src/cluster_argument_parsing.rs index beab606..36d5091 100644 --- a/src/cluster_argument_parsing.rs +++ b/src/cluster_argument_parsing.rs @@ -79,6 +79,13 @@ impl ClusterDistanceFinder for Clusterer { Clusterer::Skani(s) => s.calculate_ani(fasta1, fasta2), } } + + fn calculate_ani_contigs(&self, fasta1: &str) -> Option { + match self { + Clusterer::Fastani(f) => f.calculate_ani_contigs(fasta1), + Clusterer::Skani(s) => s.calculate_ani_contigs(fasta1), + } + } } pub struct GalahClusterer<'a> { @@ -95,6 +102,7 @@ pub struct GalahClustererCommandDefinition { pub dereplication_cluster_method_argument: String, pub dereplication_aligned_fraction_argument: String, pub dereplication_fraglen_argument: String, + pub dereplication_cluster_contigs_argument: String, // pub dereplication_ani_method_argument: String, pub dereplication_output_cluster_definition_file: String, pub dereplication_output_representative_fasta_directory: String, @@ -112,6 +120,7 @@ lazy_static! { dereplication_cluster_method_argument: "cluster-method".to_string(), dereplication_aligned_fraction_argument: "min-aligned-fraction".to_string(), dereplication_fraglen_argument: "fragment-length".to_string(), + dereplication_cluster_contigs_argument: "cluster-contigs".to_string(), // dereplication_ani_method_argument: "ani-method".to_string(), dereplication_output_cluster_definition_file: "output-cluster-definition".to_string(), dereplication_output_representative_fasta_directory: @@ -313,6 +322,14 @@ pub fn add_dereplication_clustering_parameters_to_section( default_roff(crate::DEFAULT_CLUSTER_METHOD) )), ) + .flag( + Flag::new() + .long(&format!( + "--{}", + definition.dereplication_cluster_contigs_argument + )) + .help("Cluster contigs instead of genomes."), + ) } pub fn add_dereplication_output_parameters_to_section( @@ -1152,6 +1169,9 @@ pub fn generate_galah_clusterer<'a>( }), _ => panic!("Programming error"), }, + cluster_contigs = clap_matches + .get_one::(&argument_definition.dereplication_cluster_contigs_argument) + .unwrap() }) } } @@ -1187,6 +1207,7 @@ impl GalahClusterer<'_> { &self.genome_fasta_paths, &self.preclusterer, &self.clusterer, + &self.cluster_contigs, ) } } @@ -1324,6 +1345,9 @@ pub fn add_cluster_subcommand(app: clap::Command) -> clap::Command { .help("method of calculating ANI. 'fastani' for FastANI, 'skani' for Skani") .value_parser(crate::CLUSTER_METHODS) .default_value(crate::DEFAULT_CLUSTER_METHOD)) + .arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_cluster_contigs_argument) + .long("cluster-contigs") + .help("Cluster contigs instead of genomes")) .arg(Arg::new("threads") .short('t') .long("threads") diff --git a/src/clusterer.rs b/src/clusterer.rs index 7a8e194..2438a05 100644 --- a/src/clusterer.rs +++ b/src/clusterer.rs @@ -15,6 +15,7 @@ pub fn cluster Vec> { clusterer.initialise(); diff --git a/src/fastani.rs b/src/fastani.rs index 5c78a80..e92aa30 100644 --- a/src/fastani.rs +++ b/src/fastani.rs @@ -26,6 +26,11 @@ impl ClusterDistanceFinder for FastaniClusterer { fn calculate_ani(&self, fasta1: &str, fasta2: &str) -> Option { calculate_fastani(fasta1, fasta2, self.min_aligned_threshold, self.fraglen) } + + fn calculate_ani_contigs(&self, fasta1: &str) -> Option { + // FastANI doesn't support self-self comparisons, so we can't use it for contig comparisons + None + } } pub fn calculate_fastani( diff --git a/src/lib.rs b/src/lib.rs index 0aada20..d5610cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,6 +34,8 @@ pub trait ClusterDistanceFinder { fn get_ani_threshold(&self) -> f32; fn calculate_ani(&self, fasta1: &str, fasta2: &str) -> Option; + + fn calculate_ani_contigs(&self, fasta1: &str) -> Option; } pub const DEFAULT_ALIGNED_FRACTION: &str = "15"; diff --git a/src/skani.rs b/src/skani.rs index a734ed3..e8a2025 100644 --- a/src/skani.rs +++ b/src/skani.rs @@ -126,9 +126,13 @@ impl ClusterDistanceFinder for SkaniClusterer { fn calculate_ani(&self, fasta1: &str, fasta2: &str) -> Option { Some(calculate_skani(fasta1, fasta2, self.min_aligned_threshold)) } + + fn calculate_ani_contigs(&self, fasta1: &str) -> Option { + Some(calculate_skani_contigs(fasta1, self.min_aligned_threshold)) + } } -fn default_params(mode: Mode, min_aligned_frac: f32) -> (CommandParams, SketchParams) { +fn default_params(mode: Mode, min_aligned_frac: f32, cluster_contigs: bool) -> (CommandParams, SketchParams) { let cmd_params = CommandParams { screen: true, screen_val: 0.00, @@ -143,8 +147,8 @@ fn default_params(mode: Mode, min_aligned_frac: f32) -> (CommandParams, SketchPa sparse: false, full_matrix: false, max_results: 10000000, // for Triange usize::MAX, - individual_contig_q: false, - individual_contig_r: false, + individual_contig_q: cluster_contigs, + individual_contig_r: cluster_contigs, min_aligned_frac: min_aligned_frac as f64, keep_refs: false, est_ci: false, @@ -167,7 +171,7 @@ pub fn calculate_skani(fasta1: &str, fasta2: &str, min_aligned_frac: f32) -> f32 let refs = vec![fasta1.to_string()]; let queries = vec![fasta2.to_string()]; - let (command_params, sketch_params) = default_params(Mode::Dist, min_aligned_frac); + let (command_params, sketch_params) = default_params(Mode::Dist, min_aligned_frac, false); let ref_sketch = &file_io::fastx_to_sketches(&refs, &sketch_params, true)[0]; let query_sketch = &file_io::fastx_to_sketches(&queries, &sketch_params, true)[0]; let map_params = chain::map_params_from_sketch(ref_sketch, false, &command_params); @@ -175,3 +179,15 @@ pub fn calculate_skani(fasta1: &str, fasta2: &str, min_aligned_frac: f32) -> f32 ani_result.ani * 100.0 } + +pub fn calculate_skani_contigs(fasta1: &str, min_aligned_frac: f32) -> f32 { + //Vector of Strings + let refs = vec![fasta1.to_string()]; + + let (command_params, sketch_params) = default_params(Mode::Dist, min_aligned_frac, true); + let ref_sketch = &file_io::fastx_to_sketches(&refs, &sketch_params, true)[0]; + let map_params = chain::map_params_from_sketch(ref_sketch, false, &command_params); + let ani_result = chain::chain_seeds(ref_sketch, map_params); + + ani_result.ani * 100.0 +} diff --git a/tests/data/contigs/contigs.fna b/tests/data/contigs/contigs.fna new file mode 100644 index 0000000..e4e9199 --- /dev/null +++ b/tests/data/contigs/contigs.fna @@ -0,0 +1,300 @@ +>73.20110600_S2D.10_contig_13024 +ACAAGGCAGCGTTATATGCTCGGCGTGTGGCGCACAGCGTCACTATGTCATACGAAGCGT +GATTTATGGGACGGACACGCCGCCGATGGCCCCTGTGGGGATGAAATATGATGCGTGGGA +CTTCGCGAAGACCAGCGAATGTCTGAACTGTAAAGAACACGCACGGCACAGCGTCCATCT +CGACGAGTACAAGATGAACGTGATCTGTGACGAGTGCGGTTTTTCGCGAGTGTTGAAGCT +CGATTATCTTGTCTTCCCGCATGTACTCTCGCACGAATAAGCCCCCTGAAGATGAGCACG +CGGTTGTACGTCTTTCACGCAGGGCAGTGTGATCCGAAGAAGTGCACCGCCAAAAAACTT +GCCAAATTTCATCTCGTAACTCTTTTCTCGACGCCGCGACCCTTGTGGGGCTCTCATACG +GTGCTCCTTGATCCATCCGCACCCGCTCCGTTATGCCGCGCGGATGTTGTGACGGCGCTC +GTTGCGGTTGACTGCTCCTGGAAACAACGTGAGCGCGTGTTCGAGTCGTTCCGGGCGCGG +ACGAGTCGACGACTTCCGTATCTTTTGGCAGCGAATCCTGTGAACTATGGGCGGCCGTTC +GAGCTCACAACCGTAGAAGCGCTGGCGGCTGCGCTCTACGTCCTTGGGGAATCACGACGC +GCAAGAGAACTCTTGAACAAGTTCAAATGGGGTTCGACATTTTTGAAGTTGAACGCGGCA +CCCCTCGATGAATACGCCGGCGCGAACACCGTCAGCGACGTGCTCGCACTTGAACGCGCC +TATATAGAACGCTAAATGACTTCTTATCCGCGTTCTTGCGGTTAACGCAAACAGAAAGAT +TAAATGCTGTTAGGCGTATGGTTCTACCGACGTGAGAGGTACATCTGATGGCACGTTTTC +CAGAAGCTGAAAAAAGGTTGCTTAATATCAAAATCTGCATGCGGTGTAACGCCCGCAACC +CGCCACGGGCGACTCGATGCAGGAAATGTGGTTATAAAAACCTGAGGGTCAAGTCGAAGG +AACGAAGGGGTTAGTCGATTGAGCATCGAGCAGCATCTCGAAGAGGCGATCAGCAGAGAA +GGAACGATACACCTAACGCTTATCGATCCCGACTCACAGCCGCCAGAGGAAGCCGCACGA +ATTGCACGGGGCGCCACGGCTGGAGGCACCACTGCGGTTCTTGTGGGAGGTTCTGTAGGC +GTCGGCGGACGTGGCATAGAGTGCACCGTATCGGAAATCAAAGGTAGCACGGGTGTCCCT +GCTATCATCTTCCCTTCAGACGTAGGGAGTGTGTGTACGACGGCCGACGCGATGTTTTTC +TTGTGCCTTATGAACTCTAGGAGCACGGCGTATCTCAGTACGAACCAAGCACTCGGCGCC +GTGTTTGTCAAGAGCTACGGAATAGAGCCCATTCCAGTAGCGTACATCATCGTCGAACCG +GGGGGAACTGTTGGCTGGATAGGTGATGCGCGTTTGATTCCGCGCAACAAACCCGAGATC +GCCGCCGCATACGCGCTCGCAGGAAAATATCTTGGGATGCGATGGGTCTACCTTGAGGCA +GGTTCTGGTGTCCCCGATACCGTCCCCCCCGAAATGGTGAAGGCGGTAAAAGCATCCATA +GACGATACAAGGCTCATCGTAGGCGGCGGGATACGCACCGACAAACAGGCCGAGTTGCTA +GCGAAGGCGGGAGCGGATGTCATCGTAACGGGCACGGCAATCGAAGAGGACGTTGACATC +GAAAACAAGATCGCAAGCTTTGTNNAGATCCCAAAATCGCTTGCCGCTTTGTACCTTGAT +AAGCCGCATGTCGTGTTCTTTGCACTTGCGGGCAGTTATGCGCAGCCTTCCTGATTTCGG +CAGGGGGAGTCCAAACGTGCACTTCGGATACCCGCTGCACCCTACAAACCGTCCGCCGCG +TTTTGAGCGACGCACGATGAGATCTGAGCTGCACTGAGGGCATGTCCCGACGGTCGCGTC +TCCGCGTAGCCCTTCTTGCAGCGATTTGCTAATCTCTTCGAGTTGCTCATCGAGCGTGCT +CAAGACCTTATCGAGAATATCCGTTGAGCTGACCACCACAAACTCCTTTGACAGCGTTCC +CGTTTTGATTTTGTCCATCTCTTCTTCGAGGAGTCGGGTCATCTCAGGTTTCGTAATTGT +GGGGGCGTAGCGCTCGAGCGATTGTGTGACGCTGCACGCGATACCGGTGGGACGAAGGGG +ATTCACGTCGACATAGCCCCGCTGAGCGAGCTTCTTGATGATCTCGTGGCGCGTGCTTTT +CGTCCCCAAGCCGAGCTCATCCATCTTCTTTATTAACTGGCCTTGTCCCCATCGCTGTGG +CGGCTGCGTCTCCTTCTCCACAAGCTGTTTGTTCGTGACAGTGACAACTTCGCCTTCGTG +AAGGTCAGGGAGGTATCGCTCTTGCGTCTGTGCATACGGGTAGTGATAGCGCCAGCCTCG +ATATGAAAGTTCTAGCCCAGTCGCTCTGAGCTGCTCCCCTCGGGAGTCCAGCGTGACCTT +TTTCGTGCACCATCGCGCCGGGTCGGAGAGCGTCGCAAAGAACCGTCGCACAACGAGCTC +GTACACCTTCCAGCTTTGTGGCTCGAGATCAGCTTGTGTGGCGCGCATCGTGGGATAGAT +CGGCGGATGGTCGGTCGTTTGCTTTTTACCGCGGGTGGCGACCAGCTGCTTCTTTTCTAG +CAGCATGCGAGCATTCTCAGCGAATGGACCGCTTCTGAACATGCTCACTGAGAGGCGGAG +ATCGAGCTCATTTGGATAGACTGTGTTGTCGGTGCGCGGATAGCTTATCCACCCTGCCGT +GTAGAGGCGCTCCGCCGTTTCCATAGCGCGGCTTGCTGAGATCCCAAGCGCGCTTGCTGC +AGCGAGAAACTCGGTCGTATTGAAGGGTTTCGGGGGGGCGTCCACTCGTTCGGTTTTGGC +AAACGTAACGACGGTGGCGGTGTCGCCGATGTTTTGTATCGCCGCCGTTGCGCTCGCTTT +GTTTCGAAATCGGTCTGCTTGGTGCTTTGCGTGAAACGTTTCGCTGCTCGTGCTGAGATC +AATGTATATTTCCCAGTATGGCTCTGACACGAATTGTGCAATTTTTCGCTCGCGATCAAC +GATCAGGGCGAGCGTTGGACTTTGAACGCGTCCCGCAGATAAGAAACGGTTGCCAAGTCG +CTTCGCAGACGTCGATATGAAGCGCGTGAGCGCAGCACCCCAGATGAGGTCCACTGCCTG +CCGTGCCTCTCCAGAACGAGCAAGGTTGAGGTCAATTCTGGTTGGCGATCGAAACGCGCG +CTTGACTTCAATGGGGGTCATTGCACTAAACCGTACGCGATCCACCCCGACGTCCTTGAC +CGACTTGACGATCGTGCACGCTTCTAAACCGATGAGTTCGCCTTCCGTGTCAAAGTCAGT +CGCAATGGTCACCCGGTCCGCGTCTTTGGCGGCCCGTTTGAGTCCTGACACGATGGTCTT +GTGCGTTGGAACGGTGACGACATCAGCCCGTATTAGGTCGTGAAGGTTTGTCTTCCAGTC +ACGGTATTCATCTGGGAAGTCGAGCTTGACGATGTGCCCCCTTAGGCCTAGCACGACCGT +ACTGTTGAAGCGATACGCGTTAGCGCCGGCGACTTTGGTTGCAGATGCTGCGCCGCCGGA +GAGAATCTCGGCGATCCGCTTCGCCGCGATATCCTTCTCAGTTACGATGAGGTGCATCAG +AGTTGCTCCCGAATAAGACGATTGGTAAGGGTGGGATCAGCCCGTCCGCGTGTCTTCTTC +ATGACCTGACCGACGAGGAAGTTAAGCGCGCCTTTCTTGCCGCTGTGATAGTCGTCAACG +GCCTGTGGGTTCTCCTCAACTGCTTGCTTAACGGCAGCGGTGATCTGTTCGACTGGGACA +GCTTGAAGCGCCTGTGCGTCGATGATCTCGCAAGGGGTCGCTCCGGTGTCGAGGACAGTT +CGTATGACCTCGACGGCTGCGTTTTCGGTAATGGTTCCTTGGTTCACGTTGGTGATGACG +TCGATTAAGAAGTTGGGGCTGAACGCATCAAGGGTAAGCCCGCGGTAGTTTAGCTCGCCC +TGGAGCACGTCAACGACCCAGCTTGCCGCCGCCGACGGGTCGACTCGTGCGGCGACCGTT +TCGAAGTAGTCGGCCAATGGCTTCGACGCGGTGAGCACTTTGGCAAGGTAATCTGAAATT +CTGTACTGCTCGATGAAGCGGTGACGTTTTGCATCCGGGAGTTCGACTTGCACTTGCTGC +GGGAACTGTTCGGCCACCGGTTCGGGATCGACGCGCGGAAGATCAGGTTCGGGGAAATAC +CGATAATCGTGCTCTTGCTCTTTTGAACGGAGTGAAATTGTGATGCCGCGCGCTTCGTCG +AAGTGCCGCGTTTCTTGCGTGATATGGATGCCTCTTCGAAGCAAGTTTCGCTGGCGGGTG +ATTTCAAAAAGGAGCGCTTTTTCCACGCCTTTGTATGAGGAGATGTTTTTGACTTCGGCC +CGTTGCCCGCCGGCGATCGAAATGTTAGCGTCGACGCGAAGCGATCCTTCGAGGTTGCCG +TCGAAGACGTCGAGATATTCCAGGATGTTGCGCAACTTGTTCAGGAAGCGACGTGCTTCC +TTAGGGGCGCGAAGGTCGGGTTCAGTAACCACTTCGAGCAGTGGCACGCCGGATCGATTG +TAATCTATCAGGGAATACTGCGCCTTTTCGATGGTTCCGCGGTACGTGAGCTTCCCCGGA +TCTTCTTCTAAGTGAACGCGTCTGATCCGAATCCTCCGTTCCTGCCCTTCGCTGTCGACC +ATCATGTAGCCGTTTGTGCCGAGGGGAAAGTCATACTGGCTGATTTGGAATCCTTTCGGC +AGGTCGGGGTAGTAGTAGTTCTTCCGGTAGAACTGCGTCTGCTGCACCGAGCAGCTGAGC +GCTTTCGCGACCCGTTCGGCGTAAACGATTGCCATTTCGTTGACGACCGGCAGACTCCCC +GGAAGCCCAAGACAGATGGGACACGTGTGGGTGTTTGGCTCTGAGTCTTGATACCGCGTT +GAACATCCGCAAAACAGCTTCGATTGCGTAGTTAGCTGTACGTGTGCCTCAAGCCCTATG +ATAACATCGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCGCTGATGG +GTTGCAGCACCTCGCACTGACTGGGAAGTCTGGAGGGTAGATCTATCAACCACGAGATCG +GACGGCACTGCAGCGAACCAAGTGAAAAGGTTTATCTGATAATCAGCAGCTTGTATATGG +GAGGAGTCAATATGAGCGCGTATAAATGGGCCCTTCCACTGCTGGTCACCGTCTTGTTGC +TCGCTTCGGCCTGCACCGTGGCGAGCGCGCAAAGCACACAAGGAGGCGTAGTACAGATCA +GCTCGCCTCCAAAGAACGGCAAGGCCGGACCTACGGAGCGCGTAAGCGGTAAAGCTCAGC +TTTCAAAGGGGTATTCGCTTTGGATCGTGCCGTACGATCCTACCGCGGGCAAGTACTATC +CTCAAGCGCCCTCGCTCACGGTGCGGAGTGATGCGACCTGGTCGTCTCGTCTCTCGGTAG +GCACCATATTTGGCGTCGGCAAAACGTTCCAGATCGATGCGGTCGTCGCGGATGAAAGCG +CCAGTACTGCGCTGAGCGGTTACGCCGCGCAGCGAATGGCAATAACGAAACTGCCGCCAG +GTGCGCAGGCTGTCGATGCGGTCACCGTAAAGCGCGTGGCAGCAGACGTCACGGGCACGC +CATCGGCAAGCCCCTCGCCCAGCGCCGTGCCACGCACGGCATCAACCGCTGGATCGAGCG +CGACCGCAGGGGCGAGTCCTTCTATAGCGAACGTCGCTCAAGCGAGTGCAGCTGCAAGTA +GCCAAAACCCGCTTCCCGGTTTTGAGGCGTTGTACGCACTTGGTGCCATTGCTGCCGTGT +TTCTGGCATTACGTGTCACCCGCCGTACCTGAGGCTCTGGTTGCGATGGATTGCCAATCT +TAAAACGCCTGCTTCAGGACAGATTAGCTGGAGGGGGTCAATATACAGTTGTCACACGAA +TGCTATCGATGACTTTTCCTCATTCCGTCGAAAACATGTGGGGCAGCTTGCGGCATTAAC +GAGT +>73.20110600_S2D.10_contig_13024_2 +ACAAGGCAGCGTTATATGCTCGGCGTGTGGCGCACAGCGTCACTATGTCATACGAAGCGT +GATTTATGGGACGGACACGCCGCCGATGGCCCCTGTGGGGATGAAATATGATGCGTGGGA +CTTCGCGAAGACCAGCGAATGTCTGAACTGTAAAGAACACGCACGGCACAGCGTCCATCT +CGACGAGTACAAGATGAACGTGATCTGTGACGAGTGCGGTTTTTCGCGAGTGTTGAAGCT +CGATTATCTTGTCTTCCCGCATGTACTCTCGCACGAATAAGCCCCCTGAAGATGAGCACG +CGGTTGTACGTCTTTCACGCAGGGCAGTGTGATCCGAAGAAGTGCACCGCCAAAAAACTT +GCCAAATTTCATCTCGTAACTCTTTTCTCGACGCCGCGACCCTTGTGGGGCTCTCATACG +GTGCTCCTTGATCCATCCGCACCCGCTCCGTTATGCCGCGCGGATGTTGTGACGGCGCTC +GTTGCGGTTGACTGCTCCTGGAAACAACGTGAGCGCGTGTTCGAGTCGTTCCGGGCGCGG +ACGAGTCGACGACTTCCGTATCTTTTGGCAGCGAATCCTGTGAACTATGGGCGGCCGTTC +GAGCTCACAACCGTAGAAGCGCTGGCGGCTGCGCTCTACGTCCTTGGGGAATCACGACGC +GCAAGAGAACTCTTGAACAAGTTCAAATGGGGTTCGACATTTTTGAAGTTGAACGCGGCA +CCCCTCGATGAATACGCCGGCGCGAACACCGTCAGCGACGTGCTCGCACTTGAACGCGCC +TATATAGAACGCTAAATGACTTCTTATCCGCGTTCTTGCGGTTAACGCAAACAGAAAGAT +TAAATGCTGTTAGGCGTATGGTTCTACCGACGTGAGAGGTACATCTGATGGCACGTTTTC +CAGAAGCTGAAAAAAGGTTGCTTAATATCAAAATCTGCATGCGGTGTAACGCCCGCAACC +CGCCACGGGCGACTCGATGCAGGAAATGTGGTTATAAAAACCTGAGGGTCAAGTCGAAGG +AACGAAGGGGTTAGTCGATTGAGCATCGAGCAGCATCTCGAAGAGGCGATCAGCAGAGAA +GGAACGATACACCTAACGCTTATCGATCCCGACTCACAGCCGCCAGAGGAAGCCGCACGA +ATTGCACGGGGCGCCACGGCTGGAGGCACCACTGCGGTTCTTGTGGGAGGTTCTGTAGGC +GTCGGCGGACGTGGCATAGAGTGCACCGTATCGGAAATCAAAGGTAGCACGGGTGTCCCT +GCTATCATCTTCCCTTCAGACGTAGGGAGTGTGTGTACGACGGCCGACGCGATGTTTTTC +TTGTGCCTTATGAACTCTAGGAGCACGGCGTATCTCAGTACGAACCAAGCACTCGGCGCC +GTGTTTGTCAAGAGCTACGGAATAGAGCCCATTCCAGTAGCGTACATCATCGTCGAACCG +GGGGGAACTGTTGGCTGGATAGGTGATGCGCGTTTGATTCCGCGCAACAAACCCGAGATC +GCCGCCGCATACGCGCTCGCAGGAAAATATCTTGGGATGCGATGGGTCTACCTTGAGGCA +GGTTCTGGTGTCCCCGATACCGTCCCCCCCGAAATGGTGAAGGCGGTAAAAGCATCCATA +GACGATACAAGGCTCATCGTAGGCGGCGGGATACGCACCGACAAACAGGCCGAGTTGCTA +GCGAAGGCGGGAGCGGATGTCATCGTAACGGGCACGGCAATCGAAGAGGACGTTGACATC +GAAAACAAGATCGCAAGCTTTGTNNAGATCCCAAAATCGCTTGCCGCTTTGTACCTTGAT +AAGCCGCATGTCGTGTTCTTTGCACTTGCGGGCAGTTATGCGCAGCCTTCCTGATTTCGG +CAGGGGGAGTCCAAACGTGCACTTCGGATACCCGCTGCACCCTACAAACCGTCCGCCGCG +TTTTGAGCGACGCACGATGAGATCTGAGCTGCACTGAGGGCATGTCCCGACGGTCGCGTC +TCCGCGTAGCCCTTCTTGCAGCGATTTGCTAATCTCTTCGAGTTGCTCATCGAGCGTGCT +CAAGACCTTATCGAGAATATCCGTTGAGCTGACCACCACAAACTCCTTTGACAGCGTTCC +CGTTTTGATTTTGTCCATCTCTTCTTCGAGGAGTCGGGTCATCTCAGGTTTCGTAATTGT +GGGGGCGTAGCGCTCGAGCGATTGTGTGACGCTGCACGCGATACCGGTGGGACGAAGGGG +ATTCACGTCGACATAGCCCCGCTGAGCGAGCTTCTTGATGATCTCGTGGCGCGTGCTTTT +CGTCCCCAAGCCGAGCTCATCCATCTTCTTTATTAACTGGCCTTGTCCCCATCGCTGTGG +CGGCTGCGTCTCCTTCTCCACAAGCTGTTTGTTCGTGACAGTGACAACTTCGCCTTCGTG +AAGGTCAGGGAGGTATCGCTCTTGCGTCTGTGCATACGGGTAGTGATAGCGCCAGCCTCG +ATATGAAAGTTCTAGCCCAGTCGCTCTGAGCTGCTCCCCTCGGGAGTCCAGCGTGACCTT +TTTCGTGCACCATCGCGCCGGGTCGGAGAGCGTCGCAAAGAACCGTCGCACAACGAGCTC +GTACACCTTCCAGCTTTGTGGCTCGAGATCAGCTTGTGTGGCGCGCATCGTGGGATAGAT +CGGCGGATGGTCGGTCGTTTGCTTTTTACCGCGGGTGGCGACCAGCTGCTTCTTTTCTAG +CAGCATGCGAGCATTCTCAGCGAATGGACCGCTTCTGAACATGCTCACTGAGAGGCGGAG +ATCGAGCTCATTTGGATAGACTGTGTTGTCGGTGCGCGGATAGCTTATCCACCCTGCCGT +GTAGAGGCGCTCCGCCGTTTCCATAGCGCGGCTTGCTGAGATCCCAAGCGCGCTTGCTGC +AGCGAGAAACTCGGTCGTATTGAAGGGTTTCGGGGGGGCGTCCACTCGTTCGGTTTTGGC +AAACGTAACGACGGTGGCGGTGTCGCCGATGTTTTGTATCGCCGCCGTTGCGCTCGCTTT +GTTTCGAAATCGGTCTGCTTGGTGCTTTGCGTGAAACGTTTCGCTGCTCGTGCTGAGATC +AATGTATATTTCCCAGTATGGCTCTGACACGAATTGTGCAATTTTTCGCTCGCGATCAAC +GATCAGGGCGAGCGTTGGACTTTGAACGCGTCCCGCAGATAAGAAACGGTTGCCAAGTCG +CTTCGCAGACGTCGATATGAAGCGCGTGAGCGCAGCACCCCAGATGAGGTCCACTGCCTG +CCGTGCCTCTCCAGAACGAGCAAGGTTGAGGTCAATTCTGGTTGGCGATCGAAACGCGCG +CTTGACTTCAATGGGGGTCATTGCACTAAACCGTACGCGATCCACCCCGACGTCCTTGAC +CGACTTGACGATCGTGCACGCTTCTAAACCGATGAGTTCGCCTTCCGTGTCAAAGTCAGT +CGCAATGGTCACCCGGTCCGCGTCTTTGGCGGCCCGTTTGAGTCCTGACACGATGGTCTT +GTGCGTTGGAACGGTGACGACATCAGCCCGTATTAGGTCGTGAAGGTTTGTCTTCCAGTC +ACGGTATTCATCTGGGAAGTCGAGCTTGACGATGTGCCCCCTTAGGCCTAGCACGACCGT +ACTGTTGAAGCGATACGCGTTAGCGCCGGCGACTTTGGTTGCAGATGCTGCGCCGCCGGA +GAGAATCTCGGCGATCCGCTTCGCCGCGATATCCTTCTCAGTTACGATGAGGTGCATCAG +AGTTGCTCCCGAATAAGACGATTGGTAAGGGTGGGATCAGCCCGTCCGCGTGTCTTCTTC +ATGACCTGACCGACGAGGAAGTTAAGCGCGCCTTTCTTGCCGCTGTGATAGTCGTCAACG +GCCTGTGGGTTCTCCTCAACTGCTTGCTTAACGGCAGCGGTGATCTGTTCGACTGGGACA +GCTTGAAGCGCCTGTGCGTCGATGATCTCGCAAGGGGTCGCTCCGGTGTCGAGGACAGTT +CGTATGACCTCGACGGCTGCGTTTTCGGTAATGGTTCCTTGGTTCACGTTGGTGATGACG +TCGATTAAGAAGTTGGGGCTGAACGCATCAAGGGTAAGCCCGCGGTAGTTTAGCTCGCCC +TGGAGCACGTCAACGACCCAGCTTGCCGCCGCCGACGGGTCGACTCGTGCGGCGACCGTT +TCGAAGTAGTCGGCCAATGGCTTCGACGCGGTGAGCACTTTGGCAAGGTAATCTGAAATT +CTGTACTGCTCGATGAAGCGGTGACGTTTTGCATCCGGGAGTTCGACTTGCACTTGCTGC +GGGAACTGTTCGGCCACCGGTTCGGGATCGACGCGCGGAAGATCAGGTTCGGGGAAATAC +CGATAATCGTGCTCTTGCTCTTTTGAACGGAGTGAAATTGTGATGCCGCGCGCTTCGTCG +AAGTGCCGCGTTTCTTGCGTGATATGGATGCCTCTTCGAAGCAAGTTTCGCTGGCGGGTG +ATTTCAAAAAGGAGCGCTTTTTCCACGCCTTTGTATGAGGAGATGTTTTTGACTTCGGCC +CGTTGCCCGCCGGCGATCGAAATGTTAGCGTCGACGCGAAGCGATCCTTCGAGGTTGCCG +TCGAAGACGTCGAGATATTCCAGGATGTTGCGCAACTTGTTCAGGAAGCGACGTGCTTCC +TTAGGGGCGCGAAGGTCGGGTTCAGTAACCACTTCGAGCAGTGGCACGCCGGATCGATTG +TAATCTATCAGGGAATACTGCGCCTTTTCGATGGTTCCGCGGTACGTGAGCTTCCCCGGA +TCTTCTTCTAAGTGAACGCGTCTGATCCGAATCCTCCGTTCCTGCCCTTCGCTGTCGACC +ATCATGTAGCCGTTTGTGCCGAGGGGAAAGTCATACTGGCTGATTTGGAATCCTTTCGGC +AGGTCGGGGTAGTAGTAGTTCTTCCGGTAGAACTGCGTCTGCTGCACCGAGCAGCTGAGC +GCTTTCGCGACCCGTTCGGCGTAAACGATTGCCATTTCGTTGACGACCGGCAGACTCCCC +GGAAGCCCAAGACAGATGGGACACGTGTGGGTGTTTGGCTCTGAGTCTTGATACCGCGTT +GAACATCCGCAAAACAGCTTCGATTGCGTAGTTAGCTGTACGTGTGCCTCAAGCCCTATG +ATAACATCGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCGCTGATGG +GTTGCAGCACCTCGCACTGACTGGGAAGTCTGGAGGGTAGATCTATCAACCACGAGATCG +GACGGCACTGCAGCGAACCAAGTGAAAAGGTTTATCTGATAATCAGCAGCTTGTATATGG +GAGGAGTCAATATGAGCGCGTATAAATGGGCCCTTCCACTGCTGGTCACCGTCTTGTTGC +TCGCTTCGGCCTGCACCGTGGCGAGCGCGCAAAGCACACAAGGAGGCGTAGTACAGATCA +GCTCGCCTCCAAAGAACGGCAAGGCCGGACCTACGGAGCGCGTAAGCGGTAAAGCTCAGC +TTTCAAAGGGGTATTCGCTTTGGATCGTGCCGTACGATCCTACCGCGGGCAAGTACTATC +CTCAAGCGCCCTCGCTCACGGTGCGGAGTGATGCGACCTGGTCGTCTCGTCTCTCGGTAG +GCACCATATTTGGCGTCGGCAAAACGTTCCAGATCGATGCGGTCGTCGCGGATGAAAGCG +CCAGTACTGCGCTGAGCGGTTACGCCGCGCAGCGAATGGCAATAACGAAACTGCCGCCAG +GTGCGCAGGCTGTCGATGCGGTCACCGTAAAGCGCGTGGCAGCAGACGTCACGGGCACGC +CATCGGCAAGCCCCTCGCCCAGCGCCGTGCCACGCACGGCATCAACCGCTGGATCGAGCG +CGACCGCAGGGGCGAGTCCTTCTATAGCGAACGTCGCTCAAGCGAGTGCAGCTGCAAGTA +GCCAAAACCCGCTTCCCGGTTTTGAGGCGTTGTACGCACTTGGTGCCATTGCTGCCGTGT +TTCTGGCATTACGTGTCACCCGCCGTACCTGAGGCTCTGGTTGCGATGGATTGCCAATCT +TAAAACGCCTGCTTCAGGACAGATTAGCTGGAGGGGGTCAATATACAGTTGTCACACGAA +TGCTATCGATGACTTTTCCTCATTCCGTCGAAAACATGTGGGGCAGCTTGCGGCATTAAC +GAGT +>73.20110600_S2D.10_contig_50844 +ACTCCCTCGTTTCATTGGTGGTCATTCGCGCGCTGCAGGCGGACGAGAACCCCCTATACC +CCGAACCAACTTCTAAGCCACGCCTGGTGTATTTCGAGGGGCAATTTTTCCATTGGCGAT +GTGCTGCTCCGCAATTTACGTTCCGTTTCTTCAAGCTCGGTTTTCGCCCACTCGTCACTA +TCAGTGACATCGATCCTGAAGAATGCGCGAGTGTCAAGGAAGTTCTCAAATAAGATTAGA +TCGTCGCNNTTGTTCTTCGATTGGCGTGACAACCAGTAAAGCTTGACAAGGGCAGGCTTG +TCTATTCGCCACGGACAAGGGACCGCTACTCGCCACGGATCGATCTGCGAAAAGTCGATC +TGCGATGGAAGGAGACGCCCCCATTCCGATTCCGCGGCCTGCGCTTTAATCCCATCGGAT +TCCCATGCTCGTACTTTCTCTATAGCAGCAACCCAGATAACCACAAGCTTGTTGAGGAAG +TCGGCTTTTTCTGTGTTGTCTGCGCCCGGAAAGAGGTCGAGCCTGTCATAATCTTCCCTG +ACGAATTTCATTTTGATATCCTCAGAAGCAATACGCTGTGTCGACAGCGGACCAATGATG +GAGGGGAGCGTATAGACCGCGAAGGGGCGTTCGTCGAGCGACGCGGCGGCACGCTTTATA +GCCTCGATCATTTCCGTTGTAAGAGCCCGCGGCTTTGTAAGGCCGGAATAACGTACCGCC +CATACCGCGGCGGCTTGCTCAACCGTTTCCGATAACCATAGTGAACAAGGGCGACCCGGA +CGCACTTTTTCGCCGTTCTCCTCGAGCTTCTGCCTTTTTTCCTGTCTTTTCTTCGCTGCG +TCGCTTCCTTTTTTCGGGCGGGCTCCGACGAATTTCTTTAGTCGAGGCTTGTTGCTAGGC +TGGTGCTGCTCGTAATTGGGGATAATATGTTCACTTGCCCATAGTCGAAGCGTTGCAATT +GGGACAGGAACTCCTAGCGTTTCGAGCTTCCTGACAAGCTGGCTCGAATTTATCTTACTT +GTCGCAGCAGCCATAATTTTCTAATAACTGGTTTCAAATTATTGACCAATACTTAATTAA +TTTCCACGTATAAGACAGTTTGTGGATGTTTTCAGAGGTGCAGCAATGAAAATGGAACCG +GAAAAAAACGTGCCTATGCTCTATCGCGGAGCTCGGGCGCTGACCGACGATCTAGATAAC +GCGGTCCTAGACTACGTTTTGCGTGCAGGAGATGCATCGGTGCGTCGCATCACCACAGAG +ATTGAAAGGCCCTACTCAACGGTTATCGTCCGTTGTCTAAAGCTGGAGGCGACAGGATTT +CTGCACAATATGTGGCCCGGCAACATCAGGCGGCGGCTCCACGCGGTGAACCGGCAAGGG +CGTACAAATGTGCGACGACATTCAATTGAAGGGGAAAGGCCGTTGTTGCCAAAATCTGAA +GAATGNNAAAATCATCAGCCACGAGATCACGATGCGGGCGGCCGCACGGTCGATGGGCAT +TGACATTTCCACGGTATCACGGCACATGAGCAATTGCGTTCCGAAACGCGTAGGCGAGCT +CGTTAAACCTGAACCGACAGAGGTACACGACCTAAACTGTGTCAATATCCTCGTATCATC +ACATCAGGACCTGCGAAACATCTACACCGAAGCAAGAGAGAAGGGCGACCTAAGGAACTC +ACTGAAGACGCTCGAGGTTGAGATTAAGCAAGTGCACGAGATTGCGACGCTCACAGGCCA +AACGCACGACGGCCCGCAGTTTAACTTATTGATGCTCCCGGAGTATGTCGAGTTCAAGCA +AACAGTTCTTGCCGCAGTCAGTGACTATCCTGAGGTTAGGGCCCGGATCAGCGCGGCGCT +CATGTGCACGGAGCCGGAACCAGGACCAGAAGAAGATGCTAGCGACGAACCCGTTTTTTA +GTGACCTCGCGGCTGCCCTAGACCCTGTTGACTTCGCGCAATCGCTCGGCATCGACCCCG +ATCCATGGCAGGAGGACATTCTCAGATCTGACTCCAACCGCATAATCCTCAACTGCGCGA +GACAAACTGGCAAAAGCAGCGTCGTCGCCATAATTGCCCTCCATCACGCATTGTATCATC +CGAAAGCGATGGTTATCATCATCAGCCATACGCTCCAGCAAGCGGCCGAGACGTTTCGCA +AGGTGCATGACTACTACCGGCAGATCGCAAAGCCGGTTCTCTCGATCATCGAGTCAGTTC +ACCGGCTCGAGCTGACAAACGGGTCTCGAATCGTCACGCTCACCGGCCAAGCGCCAGACA +GCATCCGGGGCTTTTCTAACGTCAGTCTCCTAATTATTGACGAAGCGAGTCAGGTTCCTG +ACGAGGCATATTATGCAGCTCGGCCAATGGTCGCGGTCAACGCCGGGAGAATTATATTAT +TATCAACTCCTCACGGCCAAAGAGGGTTTTATTGGCAGGCTTGGGCGAACGAGGACGATT +GGGAGAAGGTCGAAATCAACGCCGACCAGTGTCCTCGATTAACAAAAGCCTTTATCGTGG +AGGAAAAGGCGCTATATCCGTCATGGCTCTTTAGGCAGGAGTATTACAATGAATTCGCCG +AGGGAGTCACGTCAGTATTCCGGACCGAGGACATCGATGCGGCATT +>73.20110600_S2D.10_contig_37820 +ACCCAGCCAAAATGAGTCTAGGTAACACTAATCAGGTTCTACGACCTGTTAGCACCCTAC +CTTCACGGGAATCCGCACGTTTAAGCCGTCACGAAGATCTTGCGTATTTGTTTGTCGTTG +CGGGCTGTTCGTGGTTCTACTGAAAACTCTCGGGAGGGGCGTAAGCGCATGCGGAGATTA +TTCTGAACTTACCAATTTCTGTTTTGAAGAAGTTTGTATCTGGATCAATGAGGCAGAAAT +CTTGAAATTGCGATTTGCTATTTGCTATTTGGTGACTCTTAAAGCGTGTAGCGAAGGCGA +AAAGAGACGGTCCGCGAACATAAGGATAGCGAGTTTTGTTGTGTGCCTTTACCACATTCG +TCGCTGCGGCGAAATGTTCAACGTCGCATTGGGCGAGTGAAAAATGAGTGTATATCACAT +CAAGGGCGTTTTGTCTCGGCTTGTTATTTAAGCTGCTTTTTCCCGCAGCAAGACGCCGAG +TTTCGGAGCAATTCCGGCTGGCGAGAATTGATCTCTTTGCTTCAGTATGTTCTCCTGTTG +CGCGCGGTAACGATCCAGAGGTTTCTTTTCAAGGATCTCCATAACAGCACCCTTTGCGTC +ATTGTAACTCTGGCACAGGTACCCGATGTCCCCCATCTTGTTGAAGTAATATTCCGAGAG +TGGGGTCTTTAGTGCGATAATTGGCTTGAGATGTGAAAATGCGTCCAGCATCGCAGCAGT +TCCATTTAATTCGTAAGCACTCGCTACATGAAAAAAGAGGGCATAATCGATGCGTTTACT +ATATTTGTCAAACTCCTCTTGAGTGAGGGGCGCGTTAAGTGCGGGGATATTGACAGACGT +GTGCCGCATCTCTTTGAGCTGCTCATCTCTTACATCACCAATGAGAACAAAAATTGGCCG +ATATTTTGTTCTTGTAGAGCTCACTTCATCGGCTAACTTGAAAAAAATGTCCGCTCCCTT +GCGCAAACTCGCGTGACCATAAAAACCAAAATGAACAACATCATCTTTGAACGGCTGGCT +GCCGGTAGAATCTGCAAAGACATAAGGAACGTCTATCGGCGAAATACAGTTGTGTAGACG +TGGCAAGTAATTCTTAAGTCCTTGTTCCATATGGGGCGGGNNGTCCCGGGAGTAAGTAAT +GTATTCTCTCCGTGTTACCGGCGAGAAGCCAGTGTCGATATGAAAGGAACTGAGATGTCA +GAAGCTTTGGTTGAGGCGTGCTGATTTTGTTCTGCAGATCGTGTAGGACAACAAAACAGG +CGATCTGAGAAAATCGTCTCAGCATCATTTTGATAATGACTAGATCGTTCCTATGTGCTG +CTGTACAAAACACTATTTGCCTCGCTCGATTTTTGTCGGCCAATCTGAATAAGCGCCTGT +ACAGGGCTAGCTCCTCGAGGAGCGCTGCAAAGGGCAAGCGCCGACCTCGTTGCTCTTCGC +GCAGGGCCCTATCTGGAAGTTCGACTTCGACAAACCTGACTGTGGCCACGTGAGTATCTG +CAATCCACTGGACATGATTCAAATGTTCGCGTTCAGCACAAAATATCAGCTCCTCCTCAG +GGAAAGCCTCAGCGACTGAACAAACCAGAGCCGCATTAAATTGGGCGTGCCCAAAACCGC +GAAAATGCGGTTCGCATATAACTATCATGAGCTCTCTATATCTTTAATCAGTTGATCTTA +TCGATTTCGTTTTTTCGTTGACTGCGCTTCGTGATAATGGTGAGGTTCATTTCCGCATAC +AAGCGCTATTCCAGCCCTTGAGCTTTTCAGATGCGCAGACATACCTTGATTTGCCAATAT +ATATTTAACGTTGCAGGGAAGCTGCATCTGTCAGGAAGCTTGACTGTCAGATTATCTCAG +TTCACAAGCCTTCGCACCTGCCTGAGAATCCAGTCTTACACAATTGCCAGGATCAAACGT +CTAACTCGCTTCAAAAAAGCGTCAGAAAAAGCGATCAGCCCGGATCTCTTCAGTTCCGAT +GCGTAAATGGATGACAATGACCTTGTTTGGTTTCCAAACGAGGTGGTTAGCACGCTCGTA +GAGGTAGTCTCACTTTTGAACAAGCTCGTCGTACGGTTCATCGCTCAACAAACCATTGAA +CCTCATGACAGTTGCGTTTTTATTCGTAAAGAGCACAGCAAAAATCCTCCATCAGTGTAG +CGCAACACGCCTGACATTCAGTCCACGAGTATTCTCTGTTGATCAATTCTAAGCTTTCTC +GTTATTCCTTTACTACTTCACACCATATGTTTTCGTTGAAGGTTTTCGAAGAGAAGATCC +TAGAAAGAATAAAGACCGCGGTTCTCGGTATCCGATTGTTCCACAAGTTTATACCAACTG +TCGCATTTATCTTGGTCATCTCTTTGTAGCTTCGGATCTTTCCTCCACTTAAGCGCACGA +TAGTATTCACCTCTTCTTGTGAAAAACCACAATGTTGATGCGTTTTATCGAGTTGATGGT +ATTCAACAACTCCGCGTTCCCAGAGCGCCTCATTTCTTTTCGGAACACACAAAAGGACGT +TCTCTTGGCAACTCTTGATAGATTTGCTCAATAAAGTACATGGATCTTCTACGTGCTCTA +ATAACCAGTACAAAACCACGGTGTCGATTTTTGGGAAGGTAACTGCGGTTGCGTCGTTCG +TTATGATCTCAACGCCCGGCGGGTACGCATAATCGTCGTACCGTTGTGCGTCAATGCCAA +TGTAT diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index 3075454..6713bf0 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -337,6 +337,26 @@ mod tests { .unwrap(); } + #[test] + fn test_contig_cluster() { + Assert::main_binary() + .with_args(&[ + "cluster", + "--genome-fasta-files", + "tests/data/contigs/contigs.fna", + "--cluster-contigs", + "--output-cluster-definition", + "/dev/stdout"]) + .succeeds() + .stdout() + .is("\ + 73.20110600_S2D.10_contig_13024 73.20110600_S2D.10_contig_13024\n\ + 73.20110600_S2D.10_contig_13024 73.20110600_S2D.10_contig_13024_2\n\ + 73.20110600_S2D.10_contig_50844 73.20110600_S2D.10_contig_50844\n\ + 73.20110600_S2D.10_contig_37820 73.20110600_S2D.10_contig_37820\n") + .unwrap(); + } + // #[test] // fn test_fraglen() { // Assert::main_binary()