Skip to content

Commit

Permalink
VS-776. Update to latest version of VQSR Lite. (#8269)
Browse files Browse the repository at this point in the history
* Updated to latest version of VQSR Lite (from Master)
* Ported tests and files for VQSR Lite over
* Refactored VQSR Classic code into its own WDL
  • Loading branch information
gbggrant authored Apr 21, 2023
1 parent 17afee4 commit f4a355c
Show file tree
Hide file tree
Showing 24 changed files with 636 additions and 572 deletions.
231 changes: 49 additions & 182 deletions scripts/variantstore/wdl/GvsCreateFilterSet.wdl

Large diffs are not rendered by default.

21 changes: 9 additions & 12 deletions scripts/variantstore/wdl/GvsJointVariantCalling.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,15 @@ workflow GvsJointVariantCalling {
File sample_names_to_extract = ""
Int split_intervals_disk_size_override = ""
Int split_intervals_mem_override = ""
Int INDEL_VQSR_max_gaussians_override = 4
Int INDEL_VQSR_mem_gb_override = ""
Int SNP_VQSR_max_gaussians_override = 6
Int SNP_VQSR_mem_gb_override = ""
Int INDEL_VQSR_CLASSIC_max_gaussians_override = 4
Int INDEL_VQSR_CLASSIC_mem_gb_override = ""
Int SNP_VQSR_CLASSIC_max_gaussians_override = 6
Int SNP_VQSR_CLASSIC_mem_gb_override = ""
}
# This is the most updated snapshot of the code as of Feb 10, 2023
File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-654-g4a1c203-SNAPSHOT-local.jar"
File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
# do we ever want non-beta customers to use this instead of using GvsUnified directly? If so, we can make this an
# argument that just defaults to true
Expand All @@ -65,7 +64,6 @@ workflow GvsJointVariantCalling {
extract_table_prefix = extract_table_prefix,
fq_temp_table_dataset = "~{project_id}.~{dataset_name}",
gatk_override = gatk_override,
indel_recalibration_annotation_values = indel_recalibration_annotation_values,
interval_list = interval_list,
interval_weights_bed = interval_weights_bed,
load_data_batch_size = load_data_batch_size,
Expand All @@ -74,13 +72,12 @@ workflow GvsJointVariantCalling {
query_labels = query_labels,
query_project = project_id,
sample_names_to_extract = sample_names_to_extract,
snp_recalibration_annotation_values = snp_recalibration_annotation_values,
split_intervals_disk_size_override = split_intervals_disk_size_override,
split_intervals_mem_override = split_intervals_mem_override,
INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override,
INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override,
drop_state = drop_state,
is_beta_user = is_beta_user,
}
Expand Down
20 changes: 8 additions & 12 deletions scripts/variantstore/wdl/GvsUnified.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,11 @@ workflow GvsUnified {
# Begin GvsCreateFilterSet
String filter_set_name = call_set_identifier
Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

Int? INDEL_VQSR_max_gaussians_override = 4
Int? INDEL_VQSR_mem_gb_override
Int? SNP_VQSR_max_gaussians_override = 6
Int? SNP_VQSR_mem_gb_override
Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
Int? INDEL_VQSR_CLASSIC_mem_gb_override
Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
Int? SNP_VQSR_CLASSIC_mem_gb_override
# End GvsCreateFilterSet
# Begin GvsPrepareRangesCallset
Expand Down Expand Up @@ -116,14 +114,12 @@ workflow GvsUnified {
project_id = project_id,
call_set_identifier = call_set_identifier,
filter_set_name = filter_set_name,
indel_recalibration_annotation_values = indel_recalibration_annotation_values,
snp_recalibration_annotation_values = snp_recalibration_annotation_values,
interval_list = interval_list,
gatk_override = gatk_override,
INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override
INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override
}

call PrepareRangesCallset.GvsPrepareCallset {
Expand Down
198 changes: 198 additions & 0 deletions scripts/variantstore/wdl/GvsVQSRClassic.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
version 1.0

# GvsVQSRClassic: builds "VQSR Classic" site-level filtering models for a GVS callset.
# Produces VQSR recalibration VCFs and tranche files for both INDELs and SNPs from
# sites-only VCF input. For large callsets the SNP model is trained once and then
# applied across scattered shards; for small callsets it is trained and applied in
# a single unscattered task. Task implementations live in the imported WDLs.
import "GvsWarpTasks.wdl" as Tasks
import "GvsUtils.wdl" as Utils

workflow JointVcfFiltering {
input {
# Prefix used to name all output recalibration/tranche files.
String base_name
# Number of samples in the callset; drives the scattered-vs-unscattered SNP path below.
Int num_samples_loaded
# Merged sites-only VCF (+ index) covering the whole callset; used for INDEL
# recalibration and for the unscattered / model-creation SNP paths.
File sites_only_variant_filtered_vcf
File sites_only_variant_filtered_vcf_idx
# Per-shard sites-only VCFs (+ indexes), parallel arrays indexed together;
# used only on the scattered SNP path.
Array[File] sites_only_variant_filtered_vcfs
Array[File] sites_only_variant_filtered_vcf_idxs

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
# Optional GATK jar override passed through to tasks that accept it.
File? gatk_override

# VQSR tuning knobs, forwarded to the recalibrator tasks.
Int? INDEL_VQSR_max_gaussians_override = 4
Int? INDEL_VQSR_maximum_training_variants
Int? INDEL_VQSR_mem_gb_override
Int? SNP_VQSR_max_gaussians_override = 6
Int? SNP_VQSR_mem_gb_override
Int? SNP_VQSR_sample_every_nth_variant
Int? SNP_VQSR_maximum_training_variants

# This is the minimum number of samples where the SNP model will be created and applied in separate tasks
# (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered)
# For VQSR classic this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least
# with the default VM memory settings) so this was adjusted down to 5K.
Int snps_variant_recalibration_threshold = 5000
}

# Allele-specific annotations used to train the INDEL and SNP models.
Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

# Truth-sensitivity tranche cut points reported for each model.
Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
Array[String] indel_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"]

# reference files
# Axiom - Used only for indels
File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"

# DbSNP - BOTH SNPs and INDELs.
File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf"
File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx"

# HapMap - SNPs
File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"

# Mills - Indels
File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"

# Omni - SNPs
File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"

# 1000G - SNPs
File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"

# INDEL recalibration is always run unscattered over the merged sites-only VCF.
call Tasks.IndelsVariantRecalibrator {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
recalibration_filename = base_name + ".indels.recal",
tranches_filename = base_name + ".indels.tranches",
recalibration_tranche_values = indel_recalibration_tranche_values,
recalibration_annotation_values = indel_recalibration_annotations,
mills_resource_vcf = mills_resource_vcf,
mills_resource_vcf_index = mills_resource_vcf_index,
axiomPoly_resource_vcf = axiomPoly_resource_vcf,
axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = INDEL_VQSR_mem_gb_override,
max_gaussians = INDEL_VQSR_max_gaussians_override,
maximum_training_variants = INDEL_VQSR_maximum_training_variants,
}

# Large-callset SNP path: train the model once on the merged VCF, then apply it
# per shard and gather the per-shard tranches/recalibration results.
if (num_samples_loaded > snps_variant_recalibration_threshold) {
call Tasks.SNPsVariantRecalibratorCreateModel {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
recalibration_filename = base_name + ".snps.recal",
tranches_filename = base_name + ".snps.tranches",
model_report_filename = base_name + ".snps.model.report",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotations,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = SNP_VQSR_mem_gb_override,
max_gaussians = SNP_VQSR_max_gaussians_override,
sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant,
maximum_training_variants = SNP_VQSR_maximum_training_variants
}

# Apply the trained model to each sites-only VCF shard independently.
scatter (idx in range(length(sites_only_variant_filtered_vcfs))) {
call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcfs[idx],
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idxs[idx],
recalibration_filename = base_name + ".snps." + idx + ".recal",
tranches_filename = base_name + ".snps." + idx + ".tranches",
model_report = SNPsVariantRecalibratorCreateModel.model_report,
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotations,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = SNP_VQSR_mem_gb_override
}
}

# Combine the per-shard tranche files into a single SNP tranches file.
call Tasks.GatherTranches as SNPGatherTranches {
input:
tranches = SNPsVariantRecalibratorScattered.tranches,
output_filename = base_name + ".snps.gathered.tranches",
output_tranche_values = snp_recalibration_tranche_values,
mode = "SNP",
disk_size = "200",
gatk_override = gatk_override
}

# Merge the per-shard recalibration VCFs into one recalibration file.
call Utils.MergeVCFs as MergeRecalibrationFiles {
input:
input_vcfs = SNPsVariantRecalibratorScattered.recalibration,
gather_type = "CONVENTIONAL",
output_vcf_name = "${base_name}.vrecalibration.vcf.gz",
preemptible_tries = 3,
}
}

# Small-callset SNP path: train and apply the SNP model in one unscattered task.
if (num_samples_loaded <= snps_variant_recalibration_threshold) {
call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
recalibration_filename = base_name + ".snps.recal",
tranches_filename = base_name + ".snps.tranches",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotations,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = SNP_VQSR_mem_gb_override,
max_gaussians = SNP_VQSR_max_gaussians_override,
}
}

output {
# Exactly one of the two SNP branches above ran, so each select_first resolves
# to the scattered (merged/gathered) result or the unscattered classic result.
File snps_variant_recalibration_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration])
File snps_variant_recalibration_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index])
File snps_variant_tranches_file = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches])
File indels_variant_recalibration_file = IndelsVariantRecalibrator.recalibration
File indels_variant_recalibration_file_index = IndelsVariantRecalibrator.recalibration_index
File indels_variant_tranches_file = IndelsVariantRecalibrator.tranches
# Collect whichever monitoring logs exist; select_all drops outputs from the
# branch that did not execute.
Array[File] monitoring_logs = select_all(
flatten(
[
[IndelsVariantRecalibrator.monitoring_log],
[SNPsVariantRecalibratorCreateModel.monitoring_log],
select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]),
[SNPGatherTranches.monitoring_log],
[MergeRecalibrationFiles.monitoring_log],
[SNPsVariantRecalibratorClassic.monitoring_log]
]))
}

}

2 changes: 1 addition & 1 deletion scripts/vcf_site_level_filtering_cromwell_tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

**This directory is for GATK devs only**

This directory contains scripts for running Variant Site Level WDL tests in the automated travis build environment.
This directory contains scripts for running Variant Site Level WDL tests in the automated build environment.

Please note that this only tests whether the WDL will complete successfully.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@ set -e
script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
cd "$script_path"

WORKING_DIR=/home/runner/work/gatk
WORKING_DIR=/home/runner/work/gatk/gatk
WDL_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_wdl
CROMWELL_TEST_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_cromwell_tests

set -e
echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..."

#assume Dockerfile is in root
echo "Building docker without running unit tests... ========="
cd $WORKING_DIR/gatk
cd $WORKING_DIR

# IMPORTANT: This code is duplicated in the cnv and M2 WDL test.
if [ ! -z "$CI_PULL_REQUEST" ]; then
Expand All @@ -21,18 +23,17 @@ if [ ! -z "$CI_PULL_REQUEST" ]; then
else
HASH_TO_USE=${CI_COMMIT}
sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
echo "using travis commit:"$HASH_TO_USE
echo "using commit:"$HASH_TO_USE
fi
echo "Docker build done =========="

cd $WORKING_DIR/gatk/scripts/
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json
echo "JSON FILES (modified) ======="
cat $WORKING_DIR/vcf_site_level_filtering_travis.json
echo "=================="

sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

echo "Running Filtering WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/
java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json

cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
Loading

0 comments on commit f4a355c

Please sign in to comment.