Skip to content

Commit

Permalink
Update to version 1.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
tmokveld committed Jul 17, 2024
1 parent e231f95 commit 4b7576d
Show file tree
Hide file tree
Showing 21 changed files with 1,400 additions and 56 deletions.
11 changes: 9 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "trgt"
version = "1.0.0"
version = "1.1.0"
edition = "2021"
build = "build.rs"

Expand All @@ -26,4 +26,5 @@ tiny-skia = "0.11"
svg2pdf = "0.9"
tempfile = "3"
rayon = "1.10"
crossbeam-channel = "0.5"
crossbeam-channel = "0.5"
semver = "1.0"
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ changes to the input and output file formats of TRGT.
- Repeat definition files are available in [this Zenodo repository](https://zenodo.org/record/8329210)
and definitions of known pathogenic repeats are [also available here](repeats/).

## TRGTdb
## Joint analysis of multiple samples

TRGT outputs VCFs containing TR alleles from each region in the repeat catalog.
To facilitate analysis of alleles across multiple samples, we provide the TRGTdb
which can be found [here](https://github.com/PacificBiosciences/trgt/pull/6).
After cloning that fork, the TRGTdb can be installed by running
`python3 -m pip install trgt/`. See the fork's `notebooks/` directory for tutorials
converting results into TRGTdb as well as example analyses. TRGTdb was developed by
[Adam English](https://github.com/ACEnglish).
TRGT outputs VCFs containing repeat alleles from each region in the repeat
catalog. To facilitate analysis of repeats across multiple samples, VCFs can be
either merged into a multi-sample VCF using the `merge` sub-command or converted
into a database using the [TDB tool](https://github.com/ACEnglish/tdb) (formerly
called TRGTdb). TDB offers many advantages over multi-sample VCFs, including
simpler data extraction, support for queries, and reduced file sizes.

## Documentation

Expand Down Expand Up @@ -119,6 +118,11 @@ tandem repeats at genome scale. 2024](https://www.nature.com/articles/s41587-023
- Lower memory footprint: Better memory management significantly reduces memory usage with large repeat catalogs.
- Updated error handling: Malformed entries are now logged as errors without terminating the program.
- Added shorthand CLI options to simplify command usage.
- 1.1.0
- Added a new subcommand `trgt merge`. This command merges VCF files generated by `trgt genotype` into a joint VCF file. **Works with VCFs generated by all versions of TRGT** (the resulting joint VCF will always be in the TRGT v1.0+ format which includes padding bases).
- Added subsampling of regions with ultra-high coverage (coverage above `MAX_DEPTH * 3`, i.e. 750 by default); implemented via reservoir sampling.
- Fixed a cluster genotyper bug that occurred when only a single read covered a locus.
- Added new logic for filtering non-HiFi reads: remove up to 3% of lower quality reads that do not match the expected repeat sequence.

### DISCLAIMER

Expand Down
143 changes: 142 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::utils::{Genotyper, Result, TrgtScoring};
use crate::{
merge::vcf_writer::OutputType,
utils::{Genotyper, Result, TrgtScoring},
};
use chrono::Datelike;
use clap::{ArgAction, ArgGroup, Parser, Subcommand};
use env_logger::fmt::Color;
Expand Down Expand Up @@ -46,6 +49,91 @@ pub enum Command {
Plot(PlotArgs),
#[clap(about = "Tandem Repeat Catalog Validator")]
Validate(ValidateArgs),
#[clap(about = "Tandem Repeat VCF Merger")]
Merge(MergeArgs),
}

// CLI arguments for the `trgt merge` sub-command.
//
// NOTE: plain `//` comments are used here instead of `///` doc comments on
// purpose: clap's derive macro turns doc comments into help/about text, which
// would change the rendered CLI output. The explicit #[clap(help = ...)]
// attributes remain the single source of truth for user-facing help strings.
#[derive(Parser, Debug)]
#[command(group(ArgGroup::new("merge")))]
#[command(arg_required_else_help(true))]
pub struct MergeArgs {
    // One or more input VCFs produced by `trgt genotype`.
    #[clap(required = true)]
    #[clap(short = 'v')]
    #[clap(long = "vcf")]
    #[clap(help = "VCF files to merge")]
    #[clap(value_name = "VCF")]
    #[clap(num_args = 1..)]
    pub vcfs: Vec<PathBuf>,

    // Reference genome FASTA; existence is validated up front by
    // `check_file_exists` so the command fails fast on a bad path.
    #[clap(short = 'g')]
    #[clap(long = "genome")]
    #[clap(help = "Path to reference genome FASTA")]
    #[clap(value_name = "FASTA")]
    #[arg(value_parser = check_file_exists)]
    pub genome_path: PathBuf,

    // Output path; when absent, output goes to standard output.
    #[clap(short = 'o')]
    #[clap(long = "output")]
    #[clap(value_name = "FILE")]
    #[clap(help = "Write output to a file [standard output]")]
    #[arg(value_parser = check_prefix_path)]
    pub output: Option<String>,

    // Output format, mirroring the bcftools -O convention
    // (u/b = un/compressed BCF, v/z = un/compressed VCF).
    #[clap(help_heading("Advanced"))]
    #[clap(short = 'O')]
    #[clap(long = "output-type")]
    #[clap(value_name = "OUTPUT_TYPE")]
    #[clap(help = "Output type: u|b|v|z, u/b: un/compressed BCF, v/z: un/compressed VCF")]
    #[clap(value_parser = merge_validate_output_type)]
    pub output_type: Option<OutputType>,

    // Skip the first N records (useful for resuming / slicing work).
    #[clap(help_heading("Advanced"))]
    #[clap(long = "skip-n")]
    #[clap(value_name = "SKIP_N")]
    #[clap(help = "Skip the first N records")]
    pub skip_n: Option<usize>,

    // Process at most N records after any skipping.
    // (value_name casing fixed to match SKIP_N above.)
    #[clap(help_heading("Advanced"))]
    #[clap(long = "process-n")]
    #[clap(value_name = "PROCESS_N")]
    #[clap(help = "Only process N records")]
    pub process_n: Option<usize>,

    // Emit only the merged header, then exit without merging variants.
    #[clap(help_heading("Advanced"))]
    #[clap(long = "print-header")]
    #[clap(help = "Print only the merged header and exit")]
    pub print_header: bool,

    // Allow running with a single input VCF (normally an error).
    #[clap(help_heading("Advanced"))]
    #[clap(long = "force-single")]
    #[clap(help = "Run even if there is only one file on input")]
    pub force_single: bool,

    // Hidden: rename duplicate sample names instead of erroring out.
    #[clap(help_heading("Advanced"))]
    #[clap(hide = true)]
    #[clap(long = "force-samples")]
    #[clap(help = "Resolve duplicate sample names")]
    pub force_samples: bool,

    // Do not append the tool version / command line to the output header.
    #[clap(help_heading("Advanced"))]
    #[clap(long = "no-version")]
    #[clap(help = "Do not append version and command line to the header")]
    pub no_version: bool,

    // Hidden: treat missing genotypes as homozygous reference (0/0).
    #[clap(help_heading("Advanced"))]
    #[clap(hide = true)]
    #[clap(long = "missing-to-ref")]
    #[clap(help = "Assume genotypes at missing sites are 0/0")]
    pub missing_to_ref: bool,

    // Hidden: merging strategy; only "exact" is currently implemented,
    // enforced by the value_parser whitelist.
    #[clap(help_heading("Advanced"))]
    #[clap(hide = true)]
    #[clap(long = "strategy")]
    #[clap(value_name = "STRATEGY")]
    #[clap(help = "Set variant merging strategy to use")]
    #[clap(value_parser(["exact"]))]
    #[clap(default_value = "exact")]
    pub merge_strategy: String,
}

#[derive(Parser, Debug)]
Expand Down Expand Up @@ -391,3 +479,56 @@ fn scoring_from_string(s: &str) -> Result<TrgtScoring> {
bandwidth: values[5] as usize,
})
}

/// Parses the `-O/--output-type` CLI value into an `OutputType`.
///
/// Accepted values mirror the bcftools convention: `u`/`b` select
/// uncompressed/compressed BCF and `v`/`z` select uncompressed/compressed
/// VCF. Any other input yields an error string suitable for CLI display.
///
/// NOTE: a compression level suffix (e.g. `z4`, as bcftools allows) is not
/// accepted because the compression level cannot currently be set through
/// rust-htslib; `level` is therefore always `None` here.
fn merge_validate_output_type(s: &str) -> Result<OutputType> {
    match s {
        "u" => Ok(OutputType::Bcf {
            is_uncompressed: true,
            level: None,
        }),
        "b" => Ok(OutputType::Bcf {
            is_uncompressed: false,
            level: None,
        }),
        "v" => Ok(OutputType::Vcf {
            is_uncompressed: true,
            level: None,
        }),
        "z" => Ok(OutputType::Vcf {
            is_uncompressed: false,
            level: None,
        }),
        _ => Err(format!(
            "Invalid output type: {}. Must be one of u, b, v, z.",
            s
        )),
    }
}
1 change: 1 addition & 0 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Sub-command implementations: one module per `trgt` sub-command.
pub mod genotype;
pub mod merge;
pub mod plot;
pub mod validate;
4 changes: 2 additions & 2 deletions src/commands/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ struct ThreadLocalData {
}

thread_local! {
static LOCAL_BAM_READER: ThreadLocalData = ThreadLocalData {
static LOCAL_BAM_READER: ThreadLocalData = const { ThreadLocalData {
bam: RefCell::new(None),
};
} };
}

const CHANNEL_BUFFER_SIZE: usize = 2048;
Expand Down
20 changes: 20 additions & 0 deletions src/commands/merge.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use crate::cli::MergeArgs;
use crate::merge::vcf_processor::VcfProcessor;
use crate::utils::Result;
use std::time;

/// Entry point for the `trgt merge` sub-command.
///
/// Constructs a [`VcfProcessor`] from the parsed CLI arguments and merges the
/// input VCFs into a single joint VCF, logging the total wall-clock time on
/// completion. With `--print-header`, the command exits right after processor
/// construction, before any variants are merged.
pub fn merge(args: MergeArgs) -> Result<()> {
    let timer = time::Instant::now();

    let mut processor = VcfProcessor::new(&args)?;

    // Header-only mode: stop before touching any variant records.
    if args.print_header {
        return Ok(());
    }

    processor.merge_variants();

    // TODO: If --output, --write-index is set and the output is compressed, index the file
    log::info!("Total execution time: {:.2?}", timer.elapsed());
    Ok(())
}
2 changes: 1 addition & 1 deletion src/hmm/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ mod tests {
use super::*;
use crate::hmm::spans::Span;

fn summarize(spans: &Vec<Span>) -> Vec<(usize, usize, usize)> {
fn summarize(spans: &[Span]) -> Vec<(usize, usize, usize)> {
let mut summary = Vec::new();
for (motif_index, group) in &spans
.iter()
Expand Down
6 changes: 3 additions & 3 deletions src/hmm/purity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ mod tests {
let hmm = build_hmm(&motifs);
// GCNGCNGCNGXN
let query = "GCAGCCGCTGAG";
let states = hmm.label(&query);
let states = hmm.label(query);
let purity = calc_purity(query.as_bytes(), &hmm, &motifs, &states);
assert_eq!(purity, 11.0 / 12.0);
}
Expand All @@ -90,8 +90,8 @@ mod tests {
let motifs = vec!["CAG".as_bytes().to_vec(), "CCG".as_bytes().to_vec()];
let hmm = build_hmm(&motifs);
let query = "";
let states = hmm.label(&query);
let purity = calc_purity(&query.as_bytes(), &hmm, &motifs, &states);
let states = hmm.label(query);
let purity = calc_purity(query.as_bytes(), &hmm, &motifs, &states);
assert!(purity.is_nan());
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub mod cli;
pub mod commands;
pub mod hmm;
pub mod merge;
pub mod trgt;
pub mod trvz;
pub mod utils;
4 changes: 3 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use clap::Parser;
use trgt::{
cli::{init_verbose, Cli, Command, FULL_VERSION},
commands::{genotype, plot, validate},
commands::{genotype, merge, plot, validate},
utils::{handle_error_and_exit, Result},
};

Expand All @@ -12,6 +12,7 @@ fn runner() -> Result<()> {
Command::Genotype(_) => "genotype",
Command::Plot(_) => "plot",
Command::Validate(_) => "validate",
Command::Merge(_) => "merge",
};

log::info!(
Expand All @@ -24,6 +25,7 @@ fn runner() -> Result<()> {
Command::Genotype(args) => genotype::trgt(args)?,
Command::Plot(args) => plot::trvz(args)?,
Command::Validate(args) => validate::validate(args)?,
Command::Merge(args) => merge::merge(args)?,
}
log::info!("{} end", env!("CARGO_PKG_NAME"));
Ok(())
Expand Down
4 changes: 4 additions & 0 deletions src/merge/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
//! Support code for the `trgt merge` sub-command: merge strategy selection,
//! VCF reading/processing, and output writing.
pub mod strategy;
pub mod vcf_processor;
pub mod vcf_reader;
pub mod vcf_writer;
Loading

0 comments on commit 4b7576d

Please sign in to comment.