Split-read SV calling improvements (#15)

* remove read overlapping alignments based on mismatch rates * paired copy number predictions in split alignment regions to reduce similar calls * fix hmm prediction errors * fix insertion and duplication breakpoint errors * add ethnicity argument for use in gnomad allele frequencies
WGLab · Oct 25, 2024 · 2812c1e · 2812c1e
1 parent c82ec91
commit 2812c1e
Show file tree

Hide file tree

Showing 30 changed files with 2,077 additions and 1,756 deletions.
diff --git a/.gitignore b/.gitignore
@@ -63,8 +63,8 @@ docs/html/
 *.sif
 
 # Test directories
-python/dbscan_tests
-python/agglo_tests
+python/dbscan
+python/agglo
 linktoscripts
 tests/data
 

diff --git a/Makefile b/Makefile
@@ -8,4 +8,4 @@ all:
 	swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i
 
 	# Compile the SWIG wrapper using setuptools
-	python setup.py build_ext --build-lib $(LIB_DIR)
+	python3 setup.py build_ext --build-lib $(LIB_DIR)
diff --git a/README.md b/README.md
@@ -1,22 +1,16 @@
-
 [![build
 tests](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml/badge.svg)](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml)
 
-![contextsv_small_15p](https://github.com/WGLab/ContextSV/assets/14855676/79d70c76-a34a-472e-a14c-e49489ae0f09)
-
 # ContextSV
-> [!NOTE]
-> This is a work in progress, software is under development and not ready for official release.
 
-An alignment-based, generalized structural variant caller for long-read
-sequencing/mapping data.
-
-ContextSV takes as input a long read alignments file (BAM), a 
-corresponding reference genome file (FASTA), a VCF file with high-quality SNPs 
- (e.g. via GATK, Deepvariant, [NanoCaller](https://github.com/WGLab/NanoCaller)), and [gnomAD](https://gnomad.broadinstitute.org/downloads) database
+<p>
+<img src="https://github.com/user-attachments/assets/b6dca03c-11f8-4882-852f-d06c23bebebb" alt="ContextSV" align="left" style="width:100px;"/>
+A long-read, whole-genome structural variant (SV) caller. It takes as input long read alignments (BAM), the 
+corresponding reference genome (FASTA), a VCF with high-quality SNPs 
+ (e.g. via GATK, Deepvariant, <a href="https://github.com/WGLab/NanoCaller">NanoCaller</a>, and <a href="https://gnomad.broadinstitute.org/downloads">gnomAD</a> database
  VCF files with SNP population frequencies for each chromosome.
-
-Class documentation is available at [https://wglab.openbioinformatics.org/ContextSV](https://wglab.openbioinformatics.org/ContextSV)
+Class documentation is available at <a href="https://wglab.openbioinformatics.org/ContextSV">https://wglab.openbioinformatics.org/ContextSV</a>
+</p>
 
 ## Installation (Linux)
 ### Using Anaconda (recommended)

diff --git a/__main__.py b/__main__.py
@@ -59,6 +59,31 @@ def main():
         required=False
     )
 
+    parser.add_argument(
+        '-c', '--chr',
+        help="chromosome to analyze (e.g. 1, 2, 3, ..., X, Y)",
+        required=False,
+        default="",
+        type=str
+    )
+
+    parser.add_argument(
+        "-r", "--region",
+        help="region to analyze (e.g. 1:1000-2000)",
+        required=False,
+        default="",
+        type=str
+    )
+
+    # Specify the ethnicity of the sample for obtaining population allele
+    # frequencies from a database such as gnomAD. If not provided, the allele
+    # frequencies will be obtained for all populations.
+    parser.add_argument(
+        "-e", "--ethnicity",
+        help="ethnicity of the sample (e.g. afr, amr, eas, fin, nfe, oth, sas, asj)",
+        required=False
+    )
+
     # Text file with VCF filepaths of SNP population allele frequencies for each
     # chromosome from a database such as gnomAD (e.g. 1=chr1.vcf.gz\n2=chr2.vcf.gz\n...).
     parser.add_argument(
@@ -73,12 +98,6 @@ def main():
         required=True
     )
 
-    parser.add_argument(
-        "-r", "--region",
-        help="region to analyze (e.g. chr1, chr1:1000-2000). If not provided, the entire genome will be analyzed",
-        required=False,
-    )
-
     # Thread count.
     parser.add_argument(
         "-t", "--threads",
@@ -98,10 +117,19 @@ def main():
     # Window size for calculating log2 ratios for CNV predictions.
     parser.add_argument(
         "--window-size",
-        help="window size for calculating log2 ratios for CNV predictions (default: 10 kb)",
+        help="window size for calculating log2 ratios for CNV predictions (default: 2500 bp)",
         required=False,
         type=int,
-        default=10000
+        default=2500
+    )
+
+    # Minimum SV length for copy number variation (CNV) predictions.
+    parser.add_argument(
+        "--min-cnv-length",
+        help="minimum SV length for CNV predictions (default: 1000 bp)",
+        required=False,
+        type=int,
+        default=1000
     )
 
     # Verbose mode.
@@ -137,31 +165,6 @@ def main():
         required=False,
         help=argparse.SUPPRESS
     )
-
-    # Chromosome mean coverage values passed in as a comma-separated list (e.g. chr1:100,chr2:200,chr3:300)
-    parser.add_argument(
-        "--chr-cov",
-        required=False,
-        help=argparse.SUPPRESS
-    )
-
-    # Turn off CIGAR string SV detection (split-read only)
-    parser.add_argument(
-        "--disable-cigar",
-        required=False,
-        action="store_true",
-        default=False,
-        help=argparse.SUPPRESS
-    )
-
-    # Turn off SNP-based CNV predictions for SV classification.
-    parser.add_argument(
-        "--disable-snp-cnv",
-        required=False,
-        action="store_true",
-        default=False,
-        help=argparse.SUPPRESS
-    )
 
     # ----------------------------------------------------------------------- #
 
@@ -186,8 +189,8 @@ def main():
         log.warning("Short read alignment file not provided. Using long read alignment file in its place.")
         args.short_read = args.long_read
 
-    # SNPs file is required unless SNP-based CNV predictions are disabled.
-    if (args.snps is None and not args.disable_snp_cnv):
+    # SNPs file is required
+    if (args.snps is None):
         log.error("Please provide the SNPs file.")
         arg_error = True
 
@@ -208,39 +211,49 @@ def main():
         if value is None:
             setattr(args, key, "")
 
-    # Set input parameters.
+    # Set input parameters
     input_data = contextsv.InputData()
     input_data.setVerbose(args.debug)
     input_data.setShortReadBam(args.short_read)
     input_data.setLongReadBam(args.long_read)
     input_data.setRefGenome(args.reference)
     input_data.setSNPFilepath(args.snps)
-    input_data.setRegion(args.region)
+    input_data.setEthnicity(args.ethnicity)
     input_data.setThreadCount(args.threads)
-    input_data.setMeanChromosomeCoverage(args.chr_cov)
+    input_data.setChromosome(args.chr)
+    input_data.setRegion(args.region)
     input_data.setAlleleFreqFilepaths(args.pfb)
     input_data.setHMMFilepath(args.hmm)
     input_data.setOutputDir(args.output)
-    input_data.setDisableCIGAR(args.disable_cigar)
-    input_data.setDisableSNPCNV(args.disable_snp_cnv)
     input_data.saveCNVData(args.save_cnv)
     input_data.setWindowSize(args.window_size)
+    input_data.setMinCNVLength(args.min_cnv_length)
 
-    # Run the analysis.
+    # Run the analysis
     contextsv.run(input_data)
 
     # Determine the data paths for downstream analysis.
-    vcf_path = os.path.join(args.output, "sv_calls.vcf")
+    vcf_path = os.path.join(args.output, "output.vcf")
     output_dir = args.output
-    region = args.region
-    cnv_data_path = os.path.join(args.output, "cnv_data.tsv")
+    # cnv_data_path = os.path.join(args.output, "cnv_data.tsv")
 
-    # Generate python-based CNV plots if SNP-based CNV predictions are enabled.
-    if (args.save_cnv and not args.disable_snp_cnv):
+    # Generate python-based CNV plots if SNP-based CNV predictions are enabled
+    if (args.save_cnv):
         log.info("Generating CNV plots...")
-        cnv_plots.run(vcf_path, cnv_data_path, output_dir, region)
 
-    log.info("Complete. Thank you for using contextSV!")
+        # Find all TSV files in the output directory
+        for file in os.listdir(output_dir):
+            if file.endswith(".tsv"):
+                cnv_data_path = os.path.join(output_dir, file)
+
+                # Set the HTML output path by changing the file extension
+                output_html = os.path.splitext(cnv_data_path)[0] + ".html"
+
+                # Generate the CNV plot for the current TSV file
+                cnv_plots.run(cnv_data_path, output_html)
+        # cnv_plots.run(vcf_path, cnv_data_path, output_dir, region)
+
+    log.info("Complete. File saved to %s\nThank you for using ContextSV!", vcf_path)
 
 if __name__ == '__main__':
 

diff --git a/data/wgs_test.hmm b/data/wgs_test.hmm
@@ -0,0 +1,36 @@
+M=6
+N=6
+A:
+0.899997 0.009 0.091 0.000001 0.000001 0.000001
+0.009 0.899997 0.091 0.000001 0.000001 0.000001
+0.00001 0.00005 0.99987 0.00001 0.00005 0.00001
+0.000001 0.000001 0.00003 0.999966 0.000001 0.000001
+0.000001 0.000001 0.091 0.000001 0.899997 0.009
+0.000001 0.000001 0.091 0.000001 0.009 0.899997
+B:
+0.950000 0.000001 0.050000 0.000001 0.000001 0.000001 
+0.000001 0.950000 0.050000 0.000001 0.000001 0.000001 
+0.000001 0.000001 0.999995 0.000001 0.000001 0.000001 
+0.000001 0.000001 0.050000 0.950000 0.000001 0.000001 
+0.000001 0.000001 0.050000 0.000001 0.950000 0.000001 
+0.000001 0.000001 0.050000 0.000001 0.000001 0.950000 
+pi:
+0.000001 0.000500 0.999000 0.000001 0.000500 0.000001 
+B1_mean:
+-3.739099 -0.727964 0.000000 100 0.395454 0.658622 
+B1_sd:
+2.564467 0.303606 0.163877 0.163877 0.127181 0.124527 
+B1_uf:
+0.001000
+B2_mean:
+0.000000 0.250000 0.333333 0.500000 0.500000 
+B2_sd:
+0.155241 0.157236 0.166946 0.057305 0.044416
+B2_uf:
+0.010000
+B3_mean:
+-3.739099 -0.727964 0.000000 100 0.395454 0.658622 
+B3_sd:
+2.564467 0.303606 0.163877 0.163877 0.127181 0.124527 
+B3_uf:
+0.010000
diff --git a/environment.yml b/environment.yml
@@ -1,16 +1,21 @@
 name: contextsv
 channels:
-  - bioconda
+  - defaults
   - anaconda
   - conda-forge
-  - defaults
+  - bioconda
 dependencies:
   - python
   - numpy
   - htslib
   - swig
   - pytest
   - plotly
-  - pandas
-  - scikit-learn
-  - joblib
+
+# [A] Generate directly from the file:
+#  conda env create -f environment.yml -n contextsv
+# [B] Generate after creating a new environment:
+# conda create -n contextsv
+# conda activate contextsv
+# conda env update -f environment.yml --prune  # Prune removes unused packages
+
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
@@ -48,15 +48,14 @@ struct SNPData {
 class CNVCaller {
     private:
         InputData* input_data;
-
-        // Mutex for locking the SV candidate map
-        mutable std::mutex sv_candidates_mtx;
-
-        // Mutex for locking the SNP info
-        mutable std::mutex snp_data_mtx;
-
-        // Mutex for locking the HMM prediction
-        mutable std::mutex hmm_mtx;
+        mutable std::mutex sv_candidates_mtx; // SV candidate map mutex
+        mutable std::mutex snp_data_mtx;  // SNP data mutex
+        mutable std::mutex hmm_mtx;  // HMM mutex
+        CHMM hmm;
+        SNPData snp_data;
+        SNPInfo snp_info;
+        double mean_chr_cov = 0.0;
+        std::unordered_map<uint32_t, int> pos_depth_map;
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -92,22 +91,17 @@ class CNVCaller {
 
         void updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
-        void updateSNPVectors(SNPData& snp_data, std::vector<int64_t>& pos, std::vector<double>& pfb, std::vector<double>& baf, std::vector<double>& log2_cov, std::vector<int>& state_sequence, std::vector<bool>& is_snp);
-
-        std::vector<int> runViterbi(CHMM hmm, SNPData &snp_data);
+        std::pair<std::vector<int>, double> runViterbi(CHMM hmm, SNPData &snp_data);
 
         // Query a region for SNPs and return the SNP data
-        std::pair<SNPData, bool> querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo &snp_info, std::unordered_map<uint64_t, int> &pos_depth_map, double mean_chr_cov);
+        std::pair<SNPData, bool> querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
 
-        // Run copy number prediction for a region
-        SNPData runCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov);
+        // Run copy number prediction for a chunk of SV candidates from CIGAR strings
+        void runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, std::vector<SVCandidate> sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map);
 
-        // Run copy number prediction for a chunk of SV candidates
-        SNPData runCopyNumberPredictionChunk(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, std::vector<SVCandidate> sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint64_t, int>& pos_depth_map);
+        void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
 
-        void updateSVType(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int sv_type, std::string data_type);
-
-        void updateSVGenotype(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, std::string genotype);
+        void updateDPValue(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int dp_value);
 
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, int64_t start_pos, int64_t end_pos, int chunk_count);
@@ -116,14 +110,22 @@ class CNVCaller {
         std::vector<std::vector<SVCandidate>> splitSVCandidatesIntoChunks(std::map<SVCandidate, SVInfo>& sv_candidates, int chunk_count);
 
         // Merge the read depths from a chunk into the main read depth map
-        void mergePosDepthMaps(std::unordered_map<uint64_t, int>& pos_depth_map, std::unordered_map<uint64_t, int>& pos_depth_map_chunk);
+        void mergePosDepthMaps(std::unordered_map<uint32_t, int>& main_map, std::unordered_map<uint32_t, int>& map_update);
 
     public:
         CNVCaller(InputData& input_data);
 
-        // Detect CNVs and return the state sequence by SNP position
-        // (key = [chromosome, SNP position], value = state)
-		void run(SVData& sv_calls);
+        // Load file data for a chromosome (SNP positions, BAF values, and PFB values)
+        void loadChromosomeData(std::string chr);
+
+        // Run copy number prediction for a pair of SV candidates, and add only
+        // the SV candidate with the highest likelihood
+        std::tuple<int, double, int, std::string, bool> runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two);
+
+        // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
+        SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
+
+        void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector<std::pair<SVCandidate, std::string>>& sv_list, std::string chr);
 
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
@@ -133,7 +135,7 @@ class CNVCaller {
 
         // Calculate the log2 ratio for a region given the read depths and mean
         // chromosome coverage
-        double calculateLog2Ratio(int start_pos, int end_pos, std::unordered_map<uint64_t, int>& pos_depth_map, double mean_chr_cov);
+        double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov);
 
         // Read SNP positions and BAF values from the VCF file of SNP calls
         void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info);
@@ -143,7 +145,7 @@ class CNVCaller {
         void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);
 
         // Save a TSV with B-allele frequencies, log 2 ratios, and copy number predictions
-        void saveToTSV(SNPData& snp_data, std::string filepath);
+        void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood);
 };
 
 #endif // CNV_CALLER_H