Skip to content

Commit

Permalink
v0.2.2 - better small genome options and some bug fixes/features
Browse files Browse the repository at this point in the history
  • Loading branch information
bluenote-1577 committed Jul 5, 2024
1 parent b6cf523 commit a17e639
Show file tree
Hide file tree
Showing 13 changed files with 191 additions and 23 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
### v0.2.2 released - 2024-TODO
### v0.2.2 released - 2024-07-04

#### Major

* added the `--small-genomes` preset. This is just an alias for `-c 30 -m 200 --faster-small`. This makes skani much faster when comparing hundreds of thousands of small genomes.

#### Minor

* fixed a bug where `skani triangle --full-matrix` gave different results between STDOUT and `-o` (thanks to Florian Plaza Onate)
* added a `--diagonal` option (suggested by Antonio Camargo) to print diagonal entries for sparse and lower-triangular distance matrices
* added a warning to use `--faster-small` when comparing too many contigs (e.g. viruses, plasmids).

### v0.2.1 released - 2023-10-11

Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "skani"
###Make sure to change version in main.rs after changing cargo.toml
version = "0.2.1"
version = "0.2.2"
####
edition = "2021"
license = "MIT OR Apache-2.0"
Expand Down
13 changes: 11 additions & 2 deletions src/cmd_line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub const C_FACTOR: &str = "c";
pub const CMD_C_FACTOR: &str = "c";
pub const H_C_FACTOR: &str = "Compression factor (k-mer subsampling rate).\t[default: 125]";

pub const H_SCREEN: &str = "Screen out pairs with < % identity using k-mer sketching.\t[default: 80]";
pub const H_SCREEN: &str = "Screen out pairs with *approximately* < % identity using k-mer sketching.\t[default: 80]";

pub const CONF_INTERVAL: &str = "ci";
pub const CMD_CONF_INTERVAL: &str = "ci";
Expand All @@ -45,6 +45,10 @@ pub const MODE_SLOW: &str = "slow";
pub const CMD_MODE_SLOW : &str = "slow";
pub const H_MODE_SLOW : &str = "Slower skani mode; 4x slower and more memory. Gives much more accurate AF for distant genomes. More accurate ANI for VERY fragmented assemblies (< 3kb N50), but less accurate ANI otherwise. Alias for -c 30.";

pub const MODE_SMALL_GENOMES: &str = "small-genomes";
pub const CMD_MODE_SMALL_GENOMES: &str = "small-genomes";
pub const H_MODE_SMALL_GENOMES : &str = "Mode for small genomes such as viruses or plasmids (< 20 kb). Can be much faster for large data, but is slower/less accurate on bacterial-sized genomes. Alias for: -c 30 -m 200 --faster-small.";

pub const MODE_FAST: &str = "fast";
pub const CMD_MODE_FAST : &str = "fast";
pub const H_MODE_FAST : &str = "Faster skani mode; 2x faster and less memory. Less accurate AF and less accurate ANI for distant genomes, but works ok for high N50 and > 95% ANI. Alias for -c 200.";
Expand All @@ -63,7 +67,7 @@ pub const H_DETAIL_OUT: &str = "Print additional info including contig N50s and

pub const DISTANCE_OUT: &str = "distance";
pub const CMD_DISTANCE_OUT: &str = "distance";
pub const H_DISTANCE_OUT: &str = "Output 100 - ANI instead of ANI, creating a distance instead of a similarity matrix.";
pub const H_DISTANCE_OUT: &str = "Output 100 - ANI instead of ANI, creating a distance instead of a similarity matrix. No effect if using --sparse or -E.";

pub const INT_WRITE: &str = "intermediate write count";
pub const CMD_INT_WRITE: &str = "inter-write";
Expand All @@ -72,3 +76,8 @@ pub const H_INT_WRITE: &str = "Write results to output after --inter-write queri
pub const FAST_SMALL: &str = "faster-small";
pub const CMD_FAST_SMALL: &str = "faster-small";
pub const H_FAST_SMALL: &str = "Filter genomes with < 20 marker k-mers more aggressively. Much faster for many small genomes but may miss some comparisons.";

pub const DIAG: &str = "diagonal";
pub const CMD_DIAG: &str = "diagonal";
pub const H_DIAG: &str = "Output the diagonal of the ANI matrix (i.e. self-self comparisons) for both dense and sparse matrices.";

96 changes: 92 additions & 4 deletions src/file_io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,64 @@ fn write_header(writer: &mut impl Write, id_str: &str, ci: bool, verbose: bool)
}
}

fn write_ani_res_perfect(writer: &mut impl Write, sketch: &Sketch, ci: bool, verbose: bool) {
if !ci && !verbose {
writeln!(
writer,
"{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}",
sketch.file_name,
sketch.file_name,
100,
100,
100,
sketch.contigs[0],
sketch.contigs[0],
)
.unwrap();
} else if !verbose {
writeln!(
writer,
"{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}\t{:.2}\t{:.2}",
sketch.file_name,
sketch.file_name,
100,
100,
100,
sketch.contigs[0],
sketch.contigs[0],
100,
100,
)
.unwrap();
} else {
writeln!(
writer,
"{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}\t{:0}",
sketch.file_name,
sketch.file_name,
100,
100,
100,
sketch.contigs[0],
sketch.contigs[0],
sketch.contigs.len(),
sketch.contigs.len(),
100,
100,
0,
-1,
-1,
-1,
-1,
-1,
-1,
0,
sketch.total_sequence_length,
)
.unwrap();
}
}

fn write_ani_res(writer: &mut impl Write, ani_res: &AniEstResult, ci: bool, verbose: bool) {
if !ci && !verbose {
writeln!(
Expand Down Expand Up @@ -309,6 +367,7 @@ pub fn write_phyllip_matrix(
file_name: &str,
use_contig_names: bool,
full_matrix: bool,
diag: bool,
aai: bool,
distance: bool,
) {
Expand All @@ -332,9 +391,17 @@ pub fn write_phyllip_matrix(
if full_matrix {
end = sketches.len();
} else {
end = i;
if diag {
end = i + 1;
} else {
end = i;
}
}
for j in 0..end {
if j == i {
write!(&mut handle, "\t{:.2}", perfect).unwrap();
continue;
}
let x = usize::min(i, j);
let y = usize::max(i, j);
if i == j {
Expand Down Expand Up @@ -418,7 +485,7 @@ pub fn write_phyllip_matrix(
for j in 0..end {
let full_cond = full_matrix || (i > j);
if i == j {
if full_cond {
if full_cond || diag {
write!(&mut ani_file, "\t{:.2}", perfect).unwrap();
}
write!(&mut af_file, "\t{:.2}", 100.).unwrap();
Expand Down Expand Up @@ -473,11 +540,12 @@ pub fn write_phyllip_matrix(

pub fn write_sparse_matrix(
anis: &FxHashMap<usize, FxHashMap<usize, AniEstResult>>,
_sketches: &Vec<Sketch>,
sketches: &Vec<Sketch>,
file_name: &str,
aai: bool,
est_ci: bool,
detailed_out: bool,
diag: bool,
append: bool
) {
let id_str = if aai { "AAI" } else { "ANI" };
Expand All @@ -488,6 +556,11 @@ pub fn write_sparse_matrix(
write_header(&mut handle, id_str, est_ci, detailed_out);
}
// write!(&mut handle,"Ref_file\tQuery_file\t{}\tAlign_fraction_ref\tAlign_fraction_query\t{}_95_percentile\t{}_5_percentile\tRef_name\tQuery_name\n", id_str, id_str, id_str).unwrap();
if diag{
for sketch in sketches.iter(){
write_ani_res_perfect(&mut handle, sketch, est_ci, detailed_out);
}
}
for i in anis.keys() {
for (j, ani_res) in anis[i].iter() {
if !(anis[i][j].ani == -1. || anis[i][j].ani.is_nan()) {
Expand All @@ -511,7 +584,17 @@ pub fn write_sparse_matrix(
if !append{
write_header(&mut ani_file, id_str, est_ci, detailed_out);
}

if diag{
for sketch in sketches.iter(){
write_ani_res_perfect(&mut ani_file, sketch, est_ci, detailed_out);
}
}

for i in anis.keys() {
if diag{
write_ani_res_perfect(&mut ani_file, &sketches[*i], est_ci, detailed_out);
}
for (j, ani_res) in anis[i].iter() {
if !(anis[i][j].ani == -1. || anis[i][j].ani.is_nan()) {
write_ani_res(&mut ani_file, ani_res, est_ci, detailed_out);
Expand Down Expand Up @@ -602,7 +685,12 @@ pub fn sketches_from_sketch(ref_files: &Vec<String>) -> (SketchParams, Vec<Sketc
.for_each(|i| {
let sketch_file = &ref_files[i];
if !sketch_file.contains("markers.bin") {
let reader = BufReader::new(File::open(sketch_file).expect(sketch_file));
let f = File::open(sketch_file);
if f.is_err() {
error!("Problem reading sketch file {}. Perhaps your file path is wrong? Exiting.", sketch_file);
std::process::exit(1)
}
let reader = BufReader::new(f.unwrap());
let res: Result<(SketchParams, Sketch), _> = bincode::deserialize_from(reader);
if res.is_ok() {
let (temp_sketch_param, temp_ref_sketch) = res.unwrap();
Expand Down
22 changes: 19 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
fn main() {
let matches = Command::new("skani")
.setting(AppSettings::ArgRequiredElseHelp)
.version(params::VERSION)
.version("0.2.2")
.about("fast, robust ANI calculation and database searching for metagenomic contigs and assemblies. \n\nQuick ANI calculation:\nskani dist genome1.fa genome2.fa \n\nMemory-efficient database search:\nskani sketch genomes/* -o database; skani search -d database query1.fa query2.fa ...\n\nAll-to-all comparison:\nskani triangle genomes/*")
.subcommand(
SubCommand::with_name("help").setting(AppSettings::Hidden)
Expand Down Expand Up @@ -75,7 +75,6 @@ fn main() {
.help(H_MODE_FAST)
.takes_value(false),
)

.help_heading("SKETCH PARAMETERS")
.arg(
Arg::new("aai")
Expand Down Expand Up @@ -235,6 +234,12 @@ fn main() {
.help(H_MODE_FAST)
.takes_value(false),
)
.arg(
Arg::new(MODE_SMALL_GENOMES)
.long(CMD_MODE_SMALL_GENOMES)
.help(H_MODE_SMALL_GENOMES)
.takes_value(false),
)
.help_heading("ALGORITHM PARAMETERS")
.arg(
Arg::new(NO_LEARNED_ANI)
Expand Down Expand Up @@ -350,6 +355,12 @@ fn main() {
.long(CMD_FULL_MAT)
.help(H_FULL_MAT)
)
.arg(
Arg::new(DIAG)
.long(DIAG)
.help(H_DIAG)
.takes_value(false)
)
.arg(
Arg::new(MIN_ALIGN_FRAC)
.long(CMD_MIN_ALIGN_FRAC)
Expand Down Expand Up @@ -399,7 +410,12 @@ fn main() {
.help(H_MODE_FAST)
.takes_value(false),
)

.arg(
Arg::new(MODE_SMALL_GENOMES)
.long(CMD_MODE_SMALL_GENOMES)
.help(H_MODE_SMALL_GENOMES)
.takes_value(false),
)
.help_heading("ALGORITHM PARAMETERS")
.arg(
Arg::new(NO_LEARNED_ANI)
Expand Down
4 changes: 3 additions & 1 deletion src/params.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::types::*;
use gbdt::gradient_boost::GBDT;

pub const VERSION: &str = "0.2.1";
pub const VERSION: &str = "0.2.2";

pub const GB_IN_BYTES: usize = 1_073_741_824;
pub const SMALL_VEC_SIZE: usize = 1;
Expand Down Expand Up @@ -56,6 +56,7 @@ pub const LEARNED_INFO_HELP: &str = "Learned ANI mode detected. ANI may be adjus
pub const FAST_C: usize = 200;
pub const SLOW_C: usize = 30;
pub const MEDIUM_C: usize = 70;
pub const SMALL_M: usize = 200;

pub const ASCII_N: usize = 78;
pub const ASCII_N_SMALL: usize = 110;
Expand Down Expand Up @@ -104,6 +105,7 @@ pub struct CommandParams{
pub median: bool,
pub sparse: bool,
pub full_matrix: bool,
pub diagonal: bool,
pub max_results: usize,
pub individual_contig_q: bool,
pub individual_contig_r: bool,
Expand Down
28 changes: 22 additions & 6 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) {

let rescue_small;
if mode == Mode::Triangle || mode == Mode::Dist{
if matches_subc.is_present(FAST_SMALL){
if matches_subc.is_present(FAST_SMALL) || matches_subc.is_present(MODE_SMALL_GENOMES) {
rescue_small = false;
}
else{
Expand Down Expand Up @@ -163,6 +163,13 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) {
.unwrap_or(def_c)
.parse::<usize>()
.unwrap();

let mut marker_c = matches_subc
.value_of("marker_c")
.unwrap_or(MARKER_C_DEFAULT)
.parse::<usize>()
.unwrap();

if matches_subc.is_present(MODE_FAST) &&
matches_subc.is_present(MODE_SLOW){
panic!("Both --slow and --fast were set. This is not allowed.");
Expand All @@ -185,6 +192,15 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) {
}
c = MEDIUM_C
}
if mode == Mode::Triangle || mode == Mode::Dist{
if matches_subc.is_present(MODE_SMALL_GENOMES){
if matches_subc.is_present("c") || matches_subc.is_present("marker_c") {
warn!("-c or -m value is set but --small-genomes is also set. Using -c 30 and -m 200 instead.");
}
c = SLOW_C;
marker_c = SMALL_M;
}
}

let min_aligned_frac;
let est_ci;
Expand All @@ -209,11 +225,6 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) {
detailed_out = false;
}

let marker_c = matches_subc
.value_of("marker_c")
.unwrap_or(MARKER_C_DEFAULT)
.parse::<usize>()
.unwrap();
let out_file_name;
if mode == Mode::Triangle {
out_file_name = matches_subc.value_of("output").unwrap_or("").to_string();
Expand Down Expand Up @@ -290,10 +301,13 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) {
}

let full_matrix;
let diagonal;
if mode == Mode::Triangle {
full_matrix = matches_subc.is_present(FULL_MAT);
diagonal = matches_subc.is_present(DIAG);
} else {
full_matrix = false;
diagonal = false;
}

let screen;
Expand Down Expand Up @@ -340,6 +354,7 @@ pub fn parse_params(matches: &ArgMatches) -> (SketchParams, CommandParams) {
median,
sparse,
full_matrix,
diagonal,
max_results,
individual_contig_q,
individual_contig_r,
Expand Down Expand Up @@ -451,6 +466,7 @@ pub fn parse_params_search(matches_subc: &ArgMatches) -> (SketchParams, CommandP
median,
sparse,
full_matrix: false,
diagonal: false,
max_results,
individual_contig_q,
individual_contig_r: false,
Expand Down
1 change: 0 additions & 1 deletion src/screen.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use crate::params::*;
use std::iter;
//use std::collections::{HashMap, HashSet};
//use std::hash::{BuildHasherDefault, Hash, Hasher};
use smallvec::SmallVec;
Expand Down
Loading

0 comments on commit a17e639

Please sign in to comment.