diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bc37c4..b651203 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,14 @@ -### v0.2.2 released - 2024-TODO +### v0.2.2 released - 2024-07-04 + +#### Major + +* added the `--small-genomes` preset. This is just an alias for `-c 30 -m 200 --faster-small`. This makes skani much faster when comparing hundreds of thousands of small genomes. #### Minor * fixed a bug where `skani triangle --full-matrix` gave different results between STDOUT and `-o` (thanks to Florian Plaza Onate) +* added a `--diagonal` option (suggested by Antonio Camargo) to print diagonal entries for sparse and lower-triangular distance matrices +* added a warning to use `--faster-small` when comparing too many contigs (e.g. viruses, plasmids). ### v0.2.1 released - 2023-10-11 diff --git a/Cargo.lock b/Cargo.lock index ff3895d..3500084 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1618,7 +1618,7 @@ dependencies = [ [[package]] name = "skani" -version = "0.2.1" +version = "0.2.2" dependencies = [ "assert_cmd", "bincode", diff --git a/Cargo.toml b/Cargo.toml index abee73f..29045b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "skani" ###Make sure to change version in main.rs after changing cargo.toml -version = "0.2.1" +version = "0.2.2" #### edition = "2021" license = "MIT OR Apache-2.0" diff --git a/src/cmd_line.rs b/src/cmd_line.rs index fcdd72b..44a6d18 100644 --- a/src/cmd_line.rs +++ b/src/cmd_line.rs @@ -30,7 +30,7 @@ pub const C_FACTOR: &str = "c"; pub const CMD_C_FACTOR: &str = "c"; pub const H_C_FACTOR: &str = "Compression factor (k-mer subsampling rate).\t[default: 125]"; -pub const H_SCREEN: &str = "Screen out pairs with < % identity using k-mer sketching.\t[default: 80]"; +pub const H_SCREEN: &str = "Screen out pairs with *approximately* < % identity using k-mer sketching.\t[default: 80]"; pub const CONF_INTERVAL: &str = "ci"; pub const CMD_CONF_INTERVAL: &str = "ci"; @@ -45,6 +45,10 @@ pub const MODE_SLOW: &str = "slow"; pub const CMD_MODE_SLOW : &str = "slow"; pub const H_MODE_SLOW : &str = "Slower skani mode; 4x slower and more memory. Gives much more accurate AF for distant genomes. More accurate ANI for VERY fragmented assemblies (< 3kb N50), but less accurate ANI otherwise. Alias for -c 30."; +pub const MODE_SMALL_GENOMES: &str = "small-genomes"; +pub const CMD_MODE_SMALL_GENOMES: &str = "small-genomes"; +pub const H_MODE_SMALL_GENOMES : &str = "Mode for small genomes such as viruses or plasmids (< 20 kb). Can be much faster for large data, but is slower/less accurate on bacterial-sized genomes. Alias for: -c 30 -m 200 --faster-small."; + pub const MODE_FAST: &str = "fast"; pub const CMD_MODE_FAST : &str = "fast"; pub const H_MODE_FAST : &str = "Faster skani mode; 2x faster and less memory. Less accurate AF and less accurate ANI for distant genomes, but works ok for high N50 and > 95% ANI. Alias for -c 200."; @@ -63,7 +67,7 @@ pub const H_DETAIL_OUT: &str = "Print additional info including contig N50s and pub const DISTANCE_OUT: &str = "distance"; pub const CMD_DISTANCE_OUT: &str = "distance"; -pub const H_DISTANCE_OUT: &str = "Output 100 - ANI instead of ANI, creating a distance instead of a similarity matrix."; +pub const H_DISTANCE_OUT: &str = "Output 100 - ANI instead of ANI, creating a distance instead of a similarity matrix. No effect if using --sparse or -E."; pub const INT_WRITE: &str = "intermediate write count"; pub const CMD_INT_WRITE: &str = "inter-write"; @@ -72,3 +76,8 @@ pub const H_INT_WRITE: &str = "Write results to output after --inter-write queri pub const FAST_SMALL: &str = "faster-small"; pub const CMD_FAST_SMALL: &str = "faster-small"; pub const H_FAST_SMALL: &str = "Filter genomes with < 20 marker k-mers more aggressively. Much faster for many small genomes but may miss some comparisons."; + +pub const DIAG: &str = "diagonal"; +pub const CMD_DIAG: &str = "diagonal"; +pub const H_DIAG: &str = "Output the diagonal of the ANI matrix (i.e. self-self comparisons) for both dense and sparse matrices."; + diff --git a/src/file_io.rs b/src/file_io.rs index 0a008ab..f779a41 100644 --- a/src/file_io.rs +++ b/src/file_io.rs @@ -22,6 +22,64 @@ fn write_header(writer: &mut impl Write, id_str: &str, ci: bool, verbose: bool) } } +fn write_ani_res_perfect(writer: &mut impl Write, sketch: &Sketch, ci: bool, verbose: bool) { + if !ci && !verbose { + writeln!( + writer, + "{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}", + sketch.file_name, + sketch.file_name, + 100, + 100, + 100, + sketch.contigs[0], + sketch.contigs[0], + ) + .unwrap(); + } else if !verbose { + writeln!( + writer, + "{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}\t{:.2}\t{:.2}", + sketch.file_name, + sketch.file_name, + 100, + 100, + 100, + sketch.contigs[0], + sketch.contigs[0], + 100, + 100, + ) + .unwrap(); + } else { + writeln!( + writer, + "{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}", + sketch.file_name, + sketch.file_name, + 100, + 100, + 100, + sketch.contigs[0], + sketch.contigs[0], + sketch.contigs.len(), + sketch.contigs.len(), + 100, + 100, + 0, + -1, + -1, + -1, + -1, + -1, + -1, + 0, + sketch.total_sequence_length, + ) + .unwrap(); + } +} + fn write_ani_res(writer: &mut impl Write, ani_res: &AniEstResult, ci: bool, verbose: bool) { if !ci && !verbose { writeln!( @@ -309,6 +367,7 @@ pub fn write_phyllip_matrix( file_name: &str, use_contig_names: bool, full_matrix: bool, + diag: bool, aai: bool, distance: bool, ) { @@ -332,9 +391,17 @@ pub fn write_phyllip_matrix( if full_matrix { end = sketches.len(); } else { - end = i; + if diag { + end = i + 1; + } else { + end = i; + } } for j in 0..end { + if j == i { + write!(&mut handle, "\t{:.2}", perfect).unwrap(); + continue; + } let x = usize::min(i, j); let y = usize::max(i, j); if i == j { @@ -418,7 +485,7 @@ pub fn write_phyllip_matrix( for j in 0..end { let full_cond = full_matrix || (i > j); if i == j { - if full_cond { + if full_cond || diag { write!(&mut ani_file, "\t{:.2}", perfect).unwrap(); } write!(&mut af_file, "\t{:.2}", 100.).unwrap(); @@ -473,11 +540,12 @@ pub fn write_phyllip_matrix( pub fn write_sparse_matrix( anis: &FxHashMap>, - _sketches: &Vec, + sketches: &Vec, file_name: &str, aai: bool, est_ci: bool, detailed_out: bool, + diag: bool, append: bool ) { let id_str = if aai { "AAI" } else { "ANI" }; @@ -488,6 +556,11 @@ pub fn write_sparse_matrix( write_header(&mut handle, id_str, est_ci, detailed_out); } // write!(&mut handle,"Ref_file\tQuery_file\t{}\tAlign_fraction_ref\tAlign_fraction_query\t{}_95_percentile\t{}_5_percentile\tRef_name\tQuery_name\n", id_str, id_str, id_str).unwrap(); + if diag{ + for sketch in sketches.iter(){ + write_ani_res_perfect(&mut handle, sketch, est_ci, detailed_out); + } + } for i in anis.keys() { for (j, ani_res) in anis[i].iter() { if !(anis[i][j].ani == -1. || anis[i][j].ani.is_nan()) { @@ -511,7 +584,17 @@ pub fn write_sparse_matrix( if !append{ write_header(&mut ani_file, id_str, est_ci, detailed_out); } + + if diag{ + for sketch in sketches.iter(){ + write_ani_res_perfect(&mut ani_file, sketch, est_ci, detailed_out); + } + } + for i in anis.keys() { + if diag{ + write_ani_res_perfect(&mut ani_file, &sketches[*i], est_ci, detailed_out); + } for (j, ani_res) in anis[i].iter() { if !(anis[i][j].ani == -1. || anis[i][j].ani.is_nan()) { write_ani_res(&mut ani_file, ani_res, est_ci, detailed_out); @@ -602,7 +685,12 @@ pub fn sketches_from_sketch(ref_files: &Vec) -> (SketchParams, Vec = bincode::deserialize_from(reader); if res.is_ok() { let (temp_sketch_param, temp_ref_sketch) = res.unwrap(); diff --git a/src/main.rs b/src/main.rs index d1cad68..25648a6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,7 +21,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; fn main() { let matches = Command::new("skani") .setting(AppSettings::ArgRequiredElseHelp) - .version(params::VERSION) + .version("0.2.2") .about("fast, robust ANI calculation and database searching for metagenomic contigs and assemblies. \n\nQuick ANI calculation:\nskani dist genome1.fa genome2.fa \n\nMemory-efficient database search:\nskani sketch genomes/* -o database; skani search -d database query1.fa query2.fa ...\n\nAll-to-all comparison:\nskani triangle genomes/*") .subcommand( SubCommand::with_name("help").setting(AppSettings::Hidden) @@ -75,7 +75,6 @@ fn main() { .help(H_MODE_FAST) .takes_value(false), ) - .help_heading("SKETCH PARAMETERS") .arg( Arg::new("aai") @@ -235,6 +234,12 @@ fn main() { .help(H_MODE_FAST) .takes_value(false), ) + .arg( + Arg::new(MODE_SMALL_GENOMES) + .long(CMD_MODE_SMALL_GENOMES) + .help(H_MODE_SMALL_GENOMES) + .takes_value(false), + ) .help_heading("ALGORITHM PARAMETERS") .arg( Arg::new(NO_LEARNED_ANI) @@ -350,6 +355,12 @@ fn main() { .long(CMD_FULL_MAT) .help(H_FULL_MAT) ) + .arg( + Arg::new(DIAG) + .long(DIAG) + .help(H_DIAG) + .takes_value(false) + ) .arg( Arg::new(MIN_ALIGN_FRAC) .long(CMD_MIN_ALIGN_FRAC) @@ -399,7 +410,12 @@ fn main() { .help(H_MODE_FAST) .takes_value(false), ) - + .arg( + Arg::new(MODE_SMALL_GENOMES) + .long(CMD_MODE_SMALL_GENOMES) + .help(H_MODE_SMALL_GENOMES) + .takes_value(false), + ) .help_heading("ALGORITHM PARAMETERS") .arg( Arg::new(NO_LEARNED_ANI) diff --git a/src/params.rs b/src/params.rs index 832015e..7a9b74d 100644 --- a/src/params.rs +++ b/src/params.rs @@ -1,7 +1,7 @@ use crate::types::*; use gbdt::gradient_boost::GBDT; -pub const VERSION: &str = "0.2.1"; +pub const VERSION: &str = "0.2.2"; pub const GB_IN_BYTES: usize = 1_073_741_824; pub const SMALL_VEC_SIZE: usize = 1; @@ -56,6 +56,7 @@ pub const LEARNED_INFO_HELP: &str = "Learned ANI mode detected. ANI may be adjus pub const FAST_C: usize = 200; pub const SLOW_C: usize = 30; pub const MEDIUM_C: usize = 70; +pub const SMALL_M: usize = 200; pub const ASCII_N: usize = 78; pub const ASCII_N_SMALL: usize = 110; @@ -104,6 +105,7 @@ pub struct CommandParams{ pub median: bool, pub sparse: bool, pub full_matrix: bool, + pub diagonal: bool, pub max_results: usize, pub individual_contig_q: bool, pub individual_contig_r: bool, diff --git a/src/parse.rs b/src/parse.rs index c69c23f..ab12796 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -63,7 +63,7 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) { let rescue_small; if mode == Mode::Triangle || mode == Mode::Dist{ - if matches_subc.is_present(FAST_SMALL){ + if matches_subc.is_present(FAST_SMALL) || matches_subc.is_present(MODE_SMALL_GENOMES) { rescue_small = false; } else{ @@ -163,6 +163,13 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) { .unwrap_or(def_c) .parse::() .unwrap(); + + let mut marker_c = matches_subc + .value_of("marker_c") + .unwrap_or(MARKER_C_DEFAULT) + .parse::() + .unwrap(); + if matches_subc.is_present(MODE_FAST) && matches_subc.is_present(MODE_SLOW){ panic!("Both --slow and --fast were set. This is not allowed."); @@ -185,6 +192,15 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) { } c = MEDIUM_C } + if mode == Mode::Triangle || mode == Mode::Dist{ + if matches_subc.is_present(MODE_SMALL_GENOMES){ + if matches_subc.is_present("c") || matches_subc.is_present("marker_c") { + warn!("-c or -m value is set but --small-genomes is also set. Using -c 30 and -m 200 instead."); + } + c = SLOW_C; + marker_c = SMALL_M; + } + } let min_aligned_frac; let est_ci; @@ -209,11 +225,6 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) { detailed_out = false; } - let marker_c = matches_subc - .value_of("marker_c") - .unwrap_or(MARKER_C_DEFAULT) - .parse::() - .unwrap(); let out_file_name; if mode == Mode::Triangle { out_file_name = matches_subc.value_of("output").unwrap_or("").to_string(); @@ -290,10 +301,13 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) { } let full_matrix; + let diagonal; if mode == Mode::Triangle { full_matrix = matches_subc.is_present(FULL_MAT); + diagonal = matches_subc.is_present(DIAG); } else { full_matrix = false; + diagonal = false; } let screen; @@ -340,6 +354,7 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) { median, sparse, full_matrix, + diagonal, max_results, individual_contig_q, individual_contig_r, @@ -451,6 +466,7 @@ pub fn parse_params_search(matches_subc: &ArgMatches) -> (SketchParams, CommandP median, sparse, full_matrix: false, + diagonal: false, max_results, individual_contig_q, individual_contig_r: false, diff --git a/src/screen.rs b/src/screen.rs index 0a7a679..6c226e1 100644 --- a/src/screen.rs +++ b/src/screen.rs @@ -1,5 +1,4 @@ use crate::params::*; -use std::iter; //use std::collections::{HashMap, HashSet}; //use std::hash::{BuildHasherDefault, Hash, Hasher}; use smallvec::SmallVec; diff --git a/src/triangle.rs b/src/triangle.rs index 0a1b822..d961dc6 100644 --- a/src/triangle.rs +++ b/src/triangle.rs @@ -16,8 +16,8 @@ pub fn triangle(command_params: CommandParams, mut sketch_params: SketchParams) if command_params.refs_are_sketch { info!("Sketches detected."); let param_and_sketches = file_io::sketches_from_sketch(&command_params.ref_files); - if param_and_sketches.0.c != sketch_params.c { - warn!("Input parameter c = {} is not equal to the sketch parameter c = {}. Using sketch parameters.", sketch_params.c, param_and_sketches.0.c); + if param_and_sketches.0.c != sketch_params.c || param_and_sketches.0.marker_c != sketch_params.marker_c { + warn!("Input parameter c = {}, m = {} is not equal to the sketch parameter c = {},m = {}. Using sketch parameters.", sketch_params.c, sketch_params.marker_c, param_and_sketches.0.c, param_and_sketches.0.marker_c); } ref_sketches = param_and_sketches.1; sketch_params = param_and_sketches.0; @@ -55,6 +55,14 @@ pub fn triangle(command_params: CommandParams, mut sketch_params: SketchParams) if model_opt.is_some() { info!("{}", LEARNED_INFO_HELP); } + + let num_rescue = ref_sketches.iter().filter(|x| x.marker_seeds.len() < 20).count(); + if num_rescue > 1000 { + if command_params.rescue_small && ref_sketches.len() > 2000 { + warn!("> 1000 genomes with < 20 markers are detected. Consider decreasing -m value and/or using --faster-small for faster calculations."); + } + } + (0..ref_sketches.len() - 1) .collect::>() .into_par_iter() @@ -113,6 +121,7 @@ pub fn triangle(command_params: CommandParams, mut sketch_params: SketchParams) sketch_params.use_aa, command_params.est_ci, command_params.detailed_out, + command_params.diagonal, !*locked, ); if *locked == true { @@ -134,6 +143,7 @@ pub fn triangle(command_params: CommandParams, mut sketch_params: SketchParams) sketch_params.use_aa, command_params.est_ci, command_params.detailed_out, + command_params.diagonal, !*first.lock().unwrap(), ); } else { @@ -143,6 +153,7 @@ pub fn triangle(command_params: CommandParams, mut sketch_params: SketchParams) &command_params.out_file_name, command_params.individual_contig_r, command_params.full_matrix, + command_params.diagonal, sketch_params.use_aa, command_params.distance, ); diff --git a/test_files/viruses.fna b/test_files/viruses.fna index 3af0493..6ace4d2 100644 --- a/test_files/viruses.fna +++ b/test_files/viruses.fna @@ -498,7 +498,6 @@ GGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGT ACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAA - >OR649331.1 CTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACT CGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAG diff --git a/tests/int_test_new.rs b/tests/int_test_new.rs index bb85701..e28aa39 100644 --- a/tests/int_test_new.rs +++ b/tests/int_test_new.rs @@ -74,11 +74,32 @@ fn fast_test_small_genomes() { let results = get_result_from_out(&out_line); println!("o157_reads, number of results without faster-small: {}", results.len()); + let out_line = run_skani(&["triangle", "-t", "1", "-i", "-E", "--small-genomes", "./test_files/o157_reads.fastq"], false); + let results1 = get_result_from_out(&out_line); + + let out_line = run_skani(&["triangle","-t", "1", "-i", "-E", "-c", "30", "-m", "200", "--faster-small", "./test_files/o157_reads.fastq"], false); + let results2 = get_result_from_out(&out_line); + + assert!(results1 == results2); + + let out_line = run_skani(&["triangle", "-i", "-E", "--faster-small", "./test_files/o157_reads.fastq"], false); let results = get_result_from_out(&out_line); println!("o157_reads, number of results WITH faster-small: {}", results.len()); } +#[test] +fn test_diag_triangle(){ + let out_line = run_skani(&["triangle", "-i", "-E", "./test_files/viruses.fna", "--diagonal"], false); + let results = get_result_from_out(&out_line); + assert!(results[0].ani == 100.0); + + let out_line = run_skani(&["triangle", "-i", "-E", "./test_files/viruses.fna", "--diagonal", "--detailed"], false); + let results = get_result_from_out(&out_line); + assert!(results[0].ani == 100.0); + +} + #[test] fn fast_test_screen(){ diff --git a/tests/tests.rs b/tests/tests.rs index 20b001c..e9f8fb3 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -19,6 +19,7 @@ fn default_params(mode: Mode) -> (CommandParams, SketchParams) { median: false, sparse: false, full_matrix: false, + diagonal: false, max_results: 10000000, individual_contig_q: false, individual_contig_r: false,