From 1a2b3de9ba7f223b44b14facbd08ed1478e9f680 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 29 Mar 2023 11:36:16 -0400
Subject: [PATCH 01/25] First cut - wanna run wdl validation.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   |  10 +-
 .../JointVcfFiltering.wdl                     | 551 +++++++++---------
 2 files changed, 272 insertions(+), 289 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 8a78b0db858..3bc0a5b2cef 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -129,16 +129,16 @@ workflow GvsCreateFilterSet {
   if (!use_classic_VQSR) {
     call VQSRLite.JointVcfFiltering as JointVcfFiltering {
       input:
-        vcf = ExtractFilterTask.output_vcf,
-        vcf_index = ExtractFilterTask.output_vcf_index,
+        input_vcfs = ExtractFilterTask.output_vcf,
+        input_vcf_idxs = ExtractFilterTask.output_vcf_index,
         sites_only_vcf = MergeVCFs.output_vcf,
         sites_only_vcf_index = MergeVCFs.output_vcf_index,
-        basename = filter_set_name,
+        output_prefix = filter_set_name,
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0",
         extract_interval_list = interval_list,
         score_interval_list = interval_list,
-        snp_annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR",
-        indel_annotations = "-A AS_FS -A AS_ReadPosRankSum -A AS_MQRankSum -A AS_QD -A AS_SOR",
+        annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR",
+        resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         use_allele_specific_annotations = true,
     }
 
diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
index ccd80df201f..b7c4fddae73 100644
--- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
+++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -1,297 +1,280 @@
 version 1.0
 
-# This is a workflow for filtering a joint callset VCF using INFO level annotations (so filtering is at the site level).
-# Note that the input VCFs here may be sharded by genomic position which may be helpful for large cohorts. The script
-# will output the same number of shards that are input.
-# This portion of the filtering pipeline will assign a SCORE INFO field annotation to each site, but does not yet apply
-# the filtering threshold to the final VCF.
+# Workflow for scoring and optionally filtering a VCF based on site-level annotations using the
+# ExtractVariantAnnotations-TrainVariantAnnotationsModel-ScoreVariantAnnotations toolchain,
+# which supersedes the corresponding VariantRecalibrator-ApplyVQSR toolchain.
+# See the parameter_meta section below for descriptions of the workflow inputs.
+# Also see the GATK documentation for these tools for descriptions of the corresponding methods and additional details.
+
+struct RuntimeAttributes {
+    Int? cpu
+    Int? command_mem_gb
+    Int? additional_mem_gb
+    Int? disk_size_gb
+    Int? boot_disk_size_gb
+    Boolean? use_ssd
+    Int? preemptible
+    Int? max_retries
+}
 
 workflow JointVcfFiltering {
-	input {
-		Array[File] vcf
-		Array[File] vcf_index
-		File sites_only_vcf
-		File sites_only_vcf_index
-		String basename
-
-		String? model_backend
-		File? training_python_script
-		File? scoring_python_script
-		File? hyperparameters_json
-
-		String gatk_docker
-		File? extract_interval_list
-		File? score_interval_list
-
-		String snp_annotations
-		String indel_annotations
-		File? gatk_override
-
-		Boolean use_allele_specific_annotations
-
-		String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
-		String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
-	}
-
-	parameter_meta {
-		vcf: "An array of input VCFs that are one callset sharded by genomic region."
-		sites_only_vcf: "The full VCF callset without any genotype or sample level information."
-		basename: "Desired output file basename."
-	}
-
-	call ExtractVariantAnnotations as ExtractVariantAnnotationsSNPs {
-		input:
-			input_vcf = sites_only_vcf,
-			input_vcf_index = sites_only_vcf_index,
-			mode = "SNP",
-			annotations = snp_annotations,
-			resource_args = snp_resource_args,
-			basename = basename,
-			interval_list = extract_interval_list,
-			use_allele_specific_annotations = use_allele_specific_annotations,
-			gatk_override = gatk_override,
-			gatk_docker = gatk_docker
-	}
-
-	call ExtractVariantAnnotations as ExtractVariantAnnotationsINDELs {
-		input:
-			input_vcf = sites_only_vcf,
-			input_vcf_index = sites_only_vcf_index,
-			mode = "INDEL",
-			annotations = indel_annotations,
-			resource_args = indel_resource_args,
-			basename = basename,
-			interval_list = extract_interval_list,
-			use_allele_specific_annotations = use_allele_specific_annotations,
-			gatk_override = gatk_override,
-			gatk_docker = gatk_docker
-	}
-
-	call TrainVariantAnnotationModel as TrainVariantAnnotationModelSNPs {
-		input:
-			annots = ExtractVariantAnnotationsSNPs.annots,
-			basename = basename,
-			mode = "snp",
-			model_backend = model_backend,
-			python_script = training_python_script,
-			hyperparameters_json = hyperparameters_json,
-			gatk_override = gatk_override,
-			gatk_docker = gatk_docker
-	}
-
-	call TrainVariantAnnotationModel as TrainVariantAnnotationModelINDELs {
-		input:
-			annots = ExtractVariantAnnotationsINDELs.annots,
-			basename = basename,
-			mode = "indel",
-			model_backend = model_backend,
-			python_script = training_python_script,
-			hyperparameters_json = hyperparameters_json,
-			gatk_override = gatk_override,
-			gatk_docker = gatk_docker
-	}
-
-	scatter(idx in range(length(vcf))) {
-		call ScoreVariantAnnotations as ScoreVariantAnnotationsSNPs {
-			input:
-				vcf = vcf[idx],
-				vcf_index = vcf_index[idx],
-				basename = basename,
-				mode = "SNP",
-				model_backend = model_backend,
-				python_script = scoring_python_script,
-				annotations = snp_annotations,
-				extracted_training_vcf = ExtractVariantAnnotationsSNPs.extracted_training_vcf,
-				extracted_training_vcf_index = ExtractVariantAnnotationsSNPs.extracted_training_vcf_index,
-				interval_list = score_interval_list,
-				model_files = TrainVariantAnnotationModelSNPs.outputs,
-				resource_args = snp_resource_args,
-				use_allele_specific_annotations = use_allele_specific_annotations,
-				gatk_override = gatk_override,
-				gatk_docker = gatk_docker
-		}
-
-		call ScoreVariantAnnotations as ScoreVariantAnnotationsINDELs {
-			input:
-				vcf = vcf[idx],
-				vcf_index = vcf_index[idx],
-				basename = basename,
-				mode = "INDEL",
-				model_backend = model_backend,
-				python_script = scoring_python_script,
-				annotations = indel_annotations,
-				extracted_training_vcf = ExtractVariantAnnotationsINDELs.extracted_training_vcf,
-				extracted_training_vcf_index = ExtractVariantAnnotationsINDELs.extracted_training_vcf_index,
-				interval_list = score_interval_list,
-				model_files = TrainVariantAnnotationModelINDELs.outputs,
-				resource_args = indel_resource_args,
-				use_allele_specific_annotations = use_allele_specific_annotations,
-				gatk_override = gatk_override,
-				gatk_docker = gatk_docker
-		}
-
-	}
-
-	output {
-		Array[File] indels_variant_scored_vcf = ScoreVariantAnnotationsINDELs.output_vcf
-		Array[File] indels_variant_scored_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index
-		Array[File] snps_variant_scored_vcf = ScoreVariantAnnotationsSNPs.output_vcf
-		Array[File] snps_variant_scored_vcf_index = ScoreVariantAnnotationsSNPs.output_vcf_index
-	}
-
+    input {
+        Array[File] input_vcfs
+        Array[File] input_vcf_idxs
+        File sites_only_vcf
+        File sites_only_vcf_idx
+        String output_prefix
+
+        Array[String] annotations
+        String resource_args
+
+        String? model_backend
+        File? python_script
+        File? hyperparameters_json
+
+        String? extract_extra_args
+        String? train_extra_args
+        String? score_extra_args
+
+        String gatk_docker
+        File? gatk_override
+
+        RuntimeAttributes? extract_runtime_attributes
+        RuntimeAttributes? train_runtime_attributes
+        RuntimeAttributes? score_runtime_attributes
+    }
+
+    parameter_meta {
+        input_vcfs: "Sharded input VCFs to be scored and optionally filtered."
+        sites_only_vcf: "A concatenated, sites-only version of the sharded input VCFs; used for extracting training and calibration sets."
+        output_prefix: "Base prefix for output files. Sharded output VCFs will be named following the pattern \"{output_prefix}.{zero_based_shard_index}.score.vcf.gz\"."
+        annotations: "Annotations to be used for extraction, training, and scoring."
+        resource_args: "Resource arguments to be used for extraction and scoring. For example, \"--resource:training_and_calibration_set,training=true,calibration=true gs://path-to-training-and-calibration-set ...\".\n See GATK documentation for the ExtractVariantAnnotations and ScoreVariantAnnotations tools."
+        model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
+        python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
+        hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
+        extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
+        train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool."
+        score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
+    }
+
+    call ExtractVariantAnnotations {
+        input:
+            input_vcf = sites_only_vcf,
+            input_vcf_idx = sites_only_vcf_idx,
+            output_prefix = output_prefix,
+            annotations = annotations,
+            resource_args = resource_args,
+            extra_args = extract_extra_args,
+            gatk_docker = gatk_docker,
+            gatk_override = gatk_override,
+            runtime_attributes = extract_runtime_attributes
+    }
+
+    call TrainVariantAnnotationsModel {
+        input:
+            annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5,
+            unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5,
+            model_backend = model_backend,
+            python_script = python_script,
+            hyperparameters_json = hyperparameters_json,
+            output_prefix = output_prefix,
+            extra_args = train_extra_args,
+            gatk_docker = gatk_docker,
+            gatk_override = gatk_override,
+            runtime_attributes = train_runtime_attributes
+    }
+
+    scatter (i in range(length(input_vcfs))) {
+        call ScoreVariantAnnotations {
+            input:
+                input_vcf = input_vcfs[i],
+                input_vcf_idx = input_vcf_idxs[i],
+                output_prefix = "~{output_prefix}.~{i}",
+                annotations = annotations,
+                resource_args = resource_args,
+                extracted_vcf = ExtractVariantAnnotations.extracted_vcf,
+                extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx,
+                model_prefix = output_prefix,
+                model_files = TrainVariantAnnotationsModel.model_files,
+                extra_args = score_extra_args,
+                gatk_docker = gatk_docker,
+                gatk_override = gatk_override,
+                runtime_attributes = score_runtime_attributes
+        }
+    }
+
+    output {
+        File extracted_annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5
+        File? extracted_unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5
+        File extracted_vcf = ExtractVariantAnnotations.extracted_vcf
+        File extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx
+
+        Array[File] model_files = TrainVariantAnnotationsModel.model_files
+
+        Array[File] scored_vcfs = ScoreVariantAnnotations.scored_vcf
+        Array[File] scored_vcf_idxs = ScoreVariantAnnotations.scored_vcf_idx
+        Array[File?] annotations_hdf5s = ScoreVariantAnnotations.annotations_hdf5
+        Array[File?] scores_hdf5s = ScoreVariantAnnotations.scores_hdf5
+    }
 }
 
 task ExtractVariantAnnotations {
-	input {
-		String gatk_docker
-		File? gatk_override
-		File input_vcf
-		File input_vcf_index
-		String basename
-		String mode
-		String annotations
-		String resource_args
-		File? interval_list
-		Boolean use_allele_specific_annotations
-
-		Int memory_mb = 28000
-		Int command_mem = memory_mb - 1000
-	}
-	Int disk_size = ceil(size(input_vcf, "GB") + size(input_vcf_index, "GB") + 100)
-
-	command {
-		set -e
-
-		export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
-
-		gatk --java-options "-Xmx~{command_mem}m" \
-			ExtractVariantAnnotations \
-			-V ~{input_vcf} \
-			-O ~{basename}.~{mode} \
-			~{annotations} \
-			~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \
-			~{"-L " + interval_list} \
-			--mode ~{mode} \
-			~{resource_args}
-	}
-	output {
-		File annots = "~{basename}.~{mode}.annot.hdf5"
-		File extracted_training_vcf = "~{basename}.~{mode}.vcf.gz"
-		File extracted_training_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi"
-		Array[File] outputs = glob("~{basename}.~{mode}.*")
-	}
-	runtime {
-		docker: gatk_docker
-		disks: "local-disk " + disk_size + " HDD"
-		memory: memory_mb + " MiB"
-	}
+    input {
+        File input_vcf
+        File input_vcf_idx
+        String output_prefix
+        Array[String] annotations
+        String resource_args
+        String? extra_args
+
+        String gatk_docker
+        File? gatk_override
+
+        RuntimeAttributes runtime_attributes = {}
+    }
+
+    parameter_meta {
+        input_vcf: {localization_optional: true}
+        input_vcf_idx: {localization_optional: true}
+    }
+
+    command {
+        set -e
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
+            ExtractVariantAnnotations \
+                -V ~{input_vcf} \
+                -O ~{output_prefix}.extract \
+                -A ~{sep=" -A " annotations} \
+                ~{resource_args} \
+                ~{extra_args}
+    }
+
+    runtime {
+        docker: gatk_docker
+        cpu: select_first([runtime_attributes.cpu, 1])
+        memory: select_first([runtime_attributes.command_mem_gb, 6]) + select_first([runtime_attributes.additional_mem_gb, 1]) + " GB"
+        disks: "local-disk " + select_first([runtime_attributes.disk_size_gb, 100]) + if select_first([runtime_attributes.use_ssd, false]) then " SSD" else " HDD"
+        bootDiskSizeGb: select_first([runtime_attributes.boot_disk_size_gb, 15])
+        preemptible: select_first([runtime_attributes.preemptible, 2])
+        maxRetries: select_first([runtime_attributes.max_retries, 1])
+    }
+
+    output {
+        File annotations_hdf5 = "~{output_prefix}.extract.annot.hdf5"
+        File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5"
+        File extracted_vcf = "~{output_prefix}.extract.vcf.gz"          # this line will break if extra_args includes the do-not-gzip-vcf-output argument
+        File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi"  # this line will break if extra_args includes the do-not-gzip-vcf-output argument
+    }
 }
 
-task TrainVariantAnnotationModel {
-	input {
-		String gatk_docker
-		File? gatk_override
-		File annots
-		String basename
-		String mode
-		String? model_backend
-		File? python_script
-		File? hyperparameters_json
-
-		Int memory_mb = 28000
-		Int command_mem = memory_mb - 1000
-	}
-	Int disk_size = ceil(size(annots, "GB") + 100)
-
-	command <<<
-		set -e
-
-		export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
-
-		mode=$(echo "~{mode}" | awk '{print toupper($0)}')
-
-		gatk --java-options "-Xmx~{command_mem}m" \
-			TrainVariantAnnotationsModel \
-			--annotations-hdf5 ~{annots} \
-			-O ~{basename} \
-			~{"--model-backend " + model_backend} \
-			~{"--python-script " + python_script} \
-			~{"--hyperparameters-json " + hyperparameters_json} \
-			--mode $mode
-
-	>>>
-	output {
-		Array[File] outputs = glob("~{basename}.~{mode}.*")
-	}
-	runtime {
-		docker: gatk_docker
-		disks: "local-disk " + disk_size + " HDD"
-		memory: memory_mb + " MiB"
-	}
+task TrainVariantAnnotationsModel {
+    input {
+        File annotations_hdf5
+        File? unlabeled_annotations_hdf5
+        String? model_backend
+        File? python_script
+        File? hyperparameters_json
+        String output_prefix
+        String? extra_args
+
+        String gatk_docker
+        File? gatk_override
+
+        RuntimeAttributes runtime_attributes = {}
+    }
+
+    command {
+        set -e
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
+            TrainVariantAnnotationsModel \
+                --annotations-hdf5 ~{annotations_hdf5} \
+                ~{"--unlabeled-annotations-hdf5 " + unlabeled_annotations_hdf5} \
+                ~{"--model-backend " + model_backend} \
+                ~{"--python-script " + python_script} \
+                ~{"--hyperparameters-json " + hyperparameters_json} \
+                -O ~{output_prefix}.train \
+                ~{extra_args}
+    }
+
+    runtime {
+        docker: gatk_docker
+        cpu: select_first([runtime_attributes.cpu, 1])
+        memory: select_first([runtime_attributes.command_mem_gb, 6]) + select_first([runtime_attributes.additional_mem_gb, 1]) + " GB"
+        disks: "local-disk " + select_first([runtime_attributes.disk_size_gb, 100]) + if select_first([runtime_attributes.use_ssd, false]) then " SSD" else " HDD"
+        bootDiskSizeGb: select_first([runtime_attributes.boot_disk_size_gb, 15])
+        preemptible: select_first([runtime_attributes.preemptible, 2])
+        maxRetries: select_first([runtime_attributes.max_retries, 1])
+    }
+
+    output {
+        Array[File] model_files = glob("~{output_prefix}.train.*")
+    }
 }
 
 task ScoreVariantAnnotations {
-	input {
-		String gatk_docker
-		File? gatk_override
-		File vcf
-		File vcf_index
-		String basename
-		String mode
-		String? model_backend
-		File? python_script
-		String annotations
-		String resource_args
-		File extracted_training_vcf
-		File extracted_training_vcf_index
-		File? interval_list
-		Array[File] model_files
-		Boolean use_allele_specific_annotations
-
-		Int memory_mb = 16000
-		Int command_mem = memory_mb - 1000
-	}
-	Int disk_size = ceil(size(vcf, "GB") * 2 + 50)
-
-	command {
-		zgrep -v '#' ~{vcf} > empty.txt
-		set -e
-
-		if [ -s empty.txt ]; then
-			ln -s ~{sep=" . && ln -s " model_files} .
-
-			export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
-
-			gatk --java-options "-Xmx~{command_mem}m" \
-				ScoreVariantAnnotations \
-				~{"-L " + interval_list} \
-				-V ~{vcf} \
-				-O ~{basename}.~{mode} \
-				~{"--model-backend " + model_backend} \
-				~{"--python-script " + python_script} \
-				--model-prefix ~{basename} \
-				~{annotations} \
-				~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \
-				-mode ~{mode} \
-				--resource:extracted,extracted=true ~{extracted_training_vcf} \
-				~{resource_args}
-		else
-			echo "Input VCF was empty so we'll return the same VCF that was input."
-			echo "Scores and annot hdf5 files will not be produced since the input was empty."
-			ln -s ~{vcf} ~{basename}.~{mode}.vcf.gz
-			ln -s ~{vcf_index} ~{basename}.~{mode}.vcf.gz.tbi
-		fi
-	}
-	output {
-		File? scores = "~{basename}.~{mode}.scores.hdf5"
-		File? annots = "~{basename}.~{mode}.annot.hdf5"
-		File output_vcf = "~{basename}.~{mode}.vcf.gz"
-		File output_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi"
-	}
-	runtime {
-		docker: gatk_docker
-		disks: "local-disk " + disk_size + " HDD"
-		memory: memory_mb + " MiB"
-	}
+    input {
+        File input_vcf
+        File input_vcf_idx
+        String output_prefix
+        Array[String] annotations
+        String resource_args
+        File extracted_vcf
+        File extracted_vcf_idx
+        String model_prefix
+        Array[File] model_files
+        String? extra_args
+
+        String gatk_docker
+        File? gatk_override
+
+        RuntimeAttributes runtime_attributes = {}
+    }
+
+    parameter_meta {
+        input_vcf: {localization_optional: true}
+        input_vcf_idx: {localization_optional: true}
+        extracted_vcf: {localization_optional: true}
+        extracted_vcf_idx: {localization_optional: true}
+    }
+
+    command {
+        set -e
+
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        mkdir model-files
+        ln -s ~{sep=" model-files && ln -s " model_files} model-files
+
+        gatk --java-options "-Xmx~{default=2 runtime_attributes.command_mem_gb}G" \
+            ScoreVariantAnnotations \
+                -V ~{input_vcf} \
+                -O ~{output_prefix}.score \
+                -A ~{sep=" -A " annotations} \
+                ~{resource_args} \
+                --resource:extracted,extracted=true ~{extracted_vcf} \
+                --model-prefix model-files/~{model_prefix}.train \
+                ~{extra_args}
+    }
+
+    runtime {
+        docker: gatk_docker
+        cpu: select_first([runtime_attributes.cpu, 1])
+        memory: select_first([runtime_attributes.command_mem_gb, 2]) + select_first([runtime_attributes.additional_mem_gb, 1]) + " GB"
+        disks: "local-disk " + select_first([runtime_attributes.disk_size_gb, 100]) + if select_first([runtime_attributes.use_ssd, false]) then " SSD" else " HDD"
+        bootDiskSizeGb: select_first([runtime_attributes.boot_disk_size_gb, 15])
+        preemptible: select_first([runtime_attributes.preemptible, 2])
+        maxRetries: select_first([runtime_attributes.max_retries, 1])
+    }
+
+    output {
+        File scored_vcf = "~{output_prefix}.score.vcf.gz"               # this line will break if extra_args includes the do-not-gzip-vcf-output argument
+        File scored_vcf_idx = "~{output_prefix}.score.vcf.gz.tbi"       # this line will break if extra_args includes the do-not-gzip-vcf-output argument
+        File? annotations_hdf5 = "~{output_prefix}.score.annot.hdf5"    # this file will only be produced if the number of sites scored is nonzero
+        File? scores_hdf5 = "~{output_prefix}.score.scores.hdf5"        # this file will only be produced if the number of sites scored is nonzero
+    }
 }
-

From 01151c6c5937da2dc302377c1f678d53fcaec21b Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 29 Mar 2023 13:10:23 -0400
Subject: [PATCH 02/25] Fixed the annotations, and I think I have the flag for
 allele-specific annotations right now.

---
 .dockstore.yml                                  | 1 +
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.dockstore.yml b/.dockstore.yml
index df45322584e..218fe684052 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -96,6 +96,7 @@ workflows:
          - master
          - ah_var_store
          - gg_VS-695_RunPandSForVQSR_Lite
+         - gg_VS-776_UpdateToLatestVQSRLite
    - name: GvsPopulateAltAllele
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 3bc0a5b2cef..546e3a93cf2 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -137,9 +137,10 @@ workflow GvsCreateFilterSet {
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0",
         extract_interval_list = interval_list,
         score_interval_list = interval_list,
-        annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR",
+        annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
-        use_allele_specific_annotations = true,
+        extract_extra_args = "--use-allele-specific-annotations",
+        score_extra_args = "--use-allele-specific-annotations"
     }
 
     call Utils.MergeVCFs as MergeINDELScoredVCFs {

From 3f5931820d38dd3989707d5272ef41c88689933e Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 29 Mar 2023 13:23:30 -0400
Subject: [PATCH 03/25] Pass the interval list right? Correct for the newly
 renamed output file.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   | 28 ++++++-------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 546e3a93cf2..6bb7667be76 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -135,27 +135,17 @@ workflow GvsCreateFilterSet {
         sites_only_vcf_index = MergeVCFs.output_vcf_index,
         output_prefix = filter_set_name,
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0",
-        extract_interval_list = interval_list,
-        score_interval_list = interval_list,
         annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
-        extract_extra_args = "--use-allele-specific-annotations",
-        score_extra_args = "--use-allele-specific-annotations"
+        extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
+        score_extra_args = "-L ${interval_list} --use-allele-specific-annotations"
     }
 
-    call Utils.MergeVCFs as MergeINDELScoredVCFs {
+    call Utils.MergeVCFs as MergeScoredVCFs {
       input:
-        input_vcfs = JointVcfFiltering.indels_variant_scored_vcf,
+        input_vcfs = JointVcfFiltering.scored_vcfs,
         gather_type = "CONVENTIONAL",
-        output_vcf_name = "${filter_set_name}.indel.vrecalibration.gz",
-        preemptible_tries = 3,
-    }
-
-    call Utils.MergeVCFs as MergeSNPScoredVCFs {
-      input:
-        input_vcfs = JointVcfFiltering.snps_variant_scored_vcf,
-        gather_type = "CONVENTIONAL",
-        output_vcf_name = "${filter_set_name}.snp.vrecalibration.gz",
+        output_vcf_name = "${filter_set_name}.vrecalibration.gz",
         preemptible_tries = 3,
     }
 
@@ -166,8 +156,8 @@ workflow GvsCreateFilterSet {
     #     which we don't want to put into the filter_set_info_vqsr_lite table.
     call Utils.SelectVariants as CreateFilteredScoredSNPsVCF {
       input:
-        input_vcf = MergeSNPScoredVCFs.output_vcf,
-        input_vcf_index = MergeSNPScoredVCFs.output_vcf_index,
+        input_vcf = MergeScoredVCFs.output_vcf,
+        input_vcf_index = MergeScoredVCFs.output_vcf_index,
         type_to_include = "SNP",
         exclude_filtered = true,
         output_basename = "${filter_set_name}.filtered.scored.snps"
@@ -175,8 +165,8 @@ workflow GvsCreateFilterSet {
 
     call Utils.SelectVariants as CreateFilteredScoredINDELsVCF {
       input:
-        input_vcf = MergeINDELScoredVCFs.output_vcf,
-        input_vcf_index = MergeINDELScoredVCFs.output_vcf_index,
+        input_vcf = MergeScoredVCFs.output_vcf,
+        input_vcf_index = MergeScoredVCFs.output_vcf_index,
         type_to_include = "INDEL",
         exclude_filtered = true,
         output_basename = "${filter_set_name}.filtered.scored.indels"

From a155bbeb2c062b08339755ebdd08109a4c9fe2df Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 29 Mar 2023 13:32:47 -0400
Subject: [PATCH 04/25] Fixed another bug.

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 6bb7667be76..e3b02cac119 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -132,7 +132,7 @@ workflow GvsCreateFilterSet {
         input_vcfs = ExtractFilterTask.output_vcf,
         input_vcf_idxs = ExtractFilterTask.output_vcf_index,
         sites_only_vcf = MergeVCFs.output_vcf,
-        sites_only_vcf_index = MergeVCFs.output_vcf_index,
+        sites_only_vcf_idx = MergeVCFs.output_vcf_index,
         output_prefix = filter_set_name,
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0",
         annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],

From ee626966511ffde55df7e7331ffdadb99aad9c5b Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 29 Mar 2023 16:23:21 -0400
Subject: [PATCH 05/25] Manually copying over the vcf_site_level_filtering
 tests.

---
 .../README.md                                 |  2 +-
 .../run_vcf_site_level_filtering_wdl.sh       | 25 ++++++++++---------
 .../vcf_site_level_filtering.json             | 17 +++++++++++++
 .../vcf_site_level_filtering_pos_neg.json     | 19 ++++++++++++++
 .../vcf_site_level_filtering_travis.json      | 14 -----------
 5 files changed, 50 insertions(+), 27 deletions(-)
 create mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json
 create mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
 delete mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json

diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/README.md b/scripts/vcf_site_level_filtering_cromwell_tests/README.md
index 6f9950fa36d..28ee43d9b8c 100644
--- a/scripts/vcf_site_level_filtering_cromwell_tests/README.md
+++ b/scripts/vcf_site_level_filtering_cromwell_tests/README.md
@@ -2,7 +2,7 @@
 
 **This directory is for GATK devs only**
 
-This directory contains scripts for running Variant Site Level WDL tests in the automated travis build environment.
+This directory contains scripts for running Variant Site Level WDL tests in the automated build environment.
 
 Please note that this only tests whether the WDL will complete successfully.
 
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
index 1c19d18c3b6..1f5955aa1f1 100644
--- a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
+++ b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
@@ -4,14 +4,16 @@ set -e
 script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
 cd "$script_path"
 
-WORKING_DIR=/home/runner/work/gatk
+WORKING_DIR=/home/runner/work/gatk/gatk
+WDL_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_wdl
+CROMWELL_TEST_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_cromwell_tests
 
 set -e
 echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..."
 
 #assume Dockerfile is in root
 echo "Building docker without running unit tests... ========="
-cd $WORKING_DIR/gatk
+cd $WORKING_DIR
 
 # IMPORTANT: This code is duplicated in the cnv and M2 WDL test.
 if [ ! -z "$CI_PULL_REQUEST" ]; then
@@ -21,18 +23,17 @@ if [ ! -z "$CI_PULL_REQUEST" ]; then
 else
   HASH_TO_USE=${CI_COMMIT}
   sudo bash build_docker.sh  -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
-  echo "using travis commit:"$HASH_TO_USE
+  echo "using commit:"$HASH_TO_USE
 fi
 echo "Docker build done =========="
 
-cd $WORKING_DIR/gatk/scripts/
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json
-echo "JSON FILES (modified) ======="
-cat $WORKING_DIR/vcf_site_level_filtering_travis.json
-echo "=================="
-
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
 
 echo "Running Filtering WDL through cromwell"
-ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
-cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/
-java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json
+
+cat $WORKING_DIR/vcf_site_level_filtering_mod.json
+java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json
+
+cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
+java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
\ No newline at end of file
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json
new file mode 100644
index 00000000000..37cba35ad9d
--- /dev/null
+++ b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json
@@ -0,0 +1,17 @@
+{
+  "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__",
+  "JointVcfFiltering.input_vcfs": [
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz"],
+  "JointVcfFiltering.input_vcf_idxs": [
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi"],
+  "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz",
+  "JointVcfFiltering.sites_only_vcf_idx": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi",
+  "JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"],
+  "JointVcfFiltering.output_prefix": "test_10_samples",
+  "JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+  "JointVcfFiltering.extract_extra_args": "-L chr21"
+}
\ No newline at end of file
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
new file mode 100644
index 00000000000..ee2d116e1d4
--- /dev/null
+++ b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
@@ -0,0 +1,19 @@
+{
+  "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__",
+  "JointVcfFiltering.input_vcfs": [
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz"],
+  "JointVcfFiltering.input_vcf_idxs": [
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi"],
+  "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz",
+  "JointVcfFiltering.sites_only_vcf_idx": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi",
+  "JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"],
+  "JointVcfFiltering.output_prefix": "test_10_samples",
+  "JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+  "JointVcfFiltering.extract_extra_args": "-L chr21 --maximum-number-of-unlabeled-variants 10000000",
+  "JointVcfFiltering.train_extra_args": "--calibration-sensitivity-threshold 0.95",
+  "JointVcfFiltering.score_extra_args": "--snp-calibration-sensitivity-threshold 0.95 --indel-calibration-sensitivity-threshold 0.95"
+}
\ No newline at end of file
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json
deleted file mode 100644
index 8165e199d22..00000000000
--- a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-  "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__",
-  "JointVcfFiltering.vcf": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz",
-    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz"],
-  "JointVcfFiltering.vcf_index": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi",
-    "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi"],
-  "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz",
-  "JointVcfFiltering.sites_only_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi",
-  "JointVcfFiltering.basename": "test_10_samples",
-  "JointVcfFiltering.snp_annotations": "-A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS",
-  "JointVcfFiltering.indel_annotations": "-A MQRankSum -A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE",
-  "JointVcfFiltering.model_backend": "PYTHON_IFOREST",
-  "JointVcfFiltering.use_allele_specific_annotations": false
-}

From 42fbd2db3d8e66f463c1cbd06c467e76910987ca Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Thu, 30 Mar 2023 08:21:59 -0400
Subject: [PATCH 06/25] Up command memory for ExtractVariantAnnotations and add monitoring script

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl            | 3 ++-
 scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index a269c6782eb..f9da79feaa8 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -138,7 +138,8 @@ workflow GvsCreateFilterSet {
         annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
-        score_extra_args = "-L ${interval_list} --use-allele-specific-annotations"
+        score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
+        extract_runtime_attributes = {"command_mem_gb": 15}
     }
 
     call Utils.MergeVCFs as MergeScoredVCFs {
diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
index b7c4fddae73..cd73c8e1ef2 100644
--- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
+++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -139,10 +139,14 @@ task ExtractVariantAnnotations {
         input_vcf_idx: {localization_optional: true}
     }
 
+    File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
+
     command {
         set -e
         export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
 
+        bash ~{monitoring_script} > monitoring.log &
+
         gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \
             ExtractVariantAnnotations \
                 -V ~{input_vcf} \
@@ -167,6 +171,7 @@ task ExtractVariantAnnotations {
         File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5"
         File extracted_vcf = "~{output_prefix}.extract.vcf.gz"          # this line will break if extra_args includes the do-not-gzip-vcf-output argument
         File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi"  # this line will break if extra_args includes the do-not-gzip-vcf-output argument
+        File monitoring_log = "monitoring.log"
     }
 }
 

From fc3eb3aa2d0a756cfc10c548d16e993c4ee6b1d4 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Thu, 30 Mar 2023 11:17:33 -0400
Subject: [PATCH 07/25] Raise extract command memory to 27 GB and use GATK 4.4.0.0 docker

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index f9da79feaa8..77110028107 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -134,12 +134,12 @@ workflow GvsCreateFilterSet {
         sites_only_vcf = MergeVCFs.output_vcf,
         sites_only_vcf_idx = MergeVCFs.output_vcf_index,
         output_prefix = filter_set_name,
-        gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0",
+        gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0",
         annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
-        extract_runtime_attributes = {"command_mem_gb": 15}
+        extract_runtime_attributes = {"command_mem_gb": 27}
     }
 
     call Utils.MergeVCFs as MergeScoredVCFs {

From 068261584668bbb1196c5361aa3c321c76606723 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Thu, 30 Mar 2023 15:57:48 -0400
Subject: [PATCH 08/25] Up memory on all of VQSR Lite

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 77110028107..3d0849ca141 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -139,7 +139,9 @@ workflow GvsCreateFilterSet {
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
-        extract_runtime_attributes = {"command_mem_gb": 27}
+        extract_runtime_attributes = {"command_mem_gb": 27},
+        train_runtime_attributes = {"command_mem_gb": 27},
+        score_runtime_attributes = {"command_mem_gb": 15},
     }
 
     call Utils.MergeVCFs as MergeScoredVCFs {

From 111fbba159dab47fcd8fa92269a60cc4dc8af2bd Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Mon, 10 Apr 2023 13:59:44 -0400
Subject: [PATCH 09/25] Fix the usage of the Axiom resource.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 3d0849ca141..9cb71a2c852 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -36,18 +36,42 @@ workflow GvsCreateFilterSet {
   Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
 
   # reference files
+  # Axiom - Used only for indels
+  # Classic: known=false,training=true,truth=false
+  # Lite:    training=true,calibration=false
   File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
   File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
+
+  # DbSNP - BOTH SNPs and INDELs. But used only as known in classic (which isn't used in Lite and so dropped in lite)
+  # Classic: known=true,training=false,truth=false
+  # Lite:    Unused
   File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf"
   File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx"
+
+  # HapMap - SNPs
+  # Classic: known=false,training=true,truth=true
+  # Lite:    training=true,calibration=true
   File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
   File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"
+
+  # Mills - Indels
+  # Classic: known=false,training=true,truth=true
+  # Lite:    training=true,calibration=true
   File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
   File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
+
+  # Omni - SNPs
+  # Classic: known=false,training=true,truth=true
+  # Lite:    training=true,calibration=true
   File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
   File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"
+
+  # 1000G - SNPs
+  # Classic: known=false,training=true,truth=false
+  # Lite:    training=true,calibration=false
   File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
   File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"
+
   File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
   File reference_dict = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict"
   File reference_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
@@ -136,7 +160,7 @@ workflow GvsCreateFilterSet {
         output_prefix = filter_set_name,
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0",
         annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
-        resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
+        resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         extract_runtime_attributes = {"command_mem_gb": 27},

From 96f90267a9f7a4c63829057bff2c5e060dfc0a90 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 11:39:09 -0400
Subject: [PATCH 10/25] Clean up from a bad merge

---
 .../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
index d0b059d24ce..9ffd5c63b99 100644
--- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
+++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -116,6 +116,7 @@ workflow JointVcfFiltering {
         File extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx
 
         Array[File] model_files = TrainVariantAnnotationsModel.model_files
+
         Array[File] scored_vcfs = ScoreVariantAnnotations.scored_vcf
         Array[File] scored_vcf_idxs = ScoreVariantAnnotations.scored_vcf_idx
         Array[File?] annotations_hdf5s = ScoreVariantAnnotations.annotations_hdf5
@@ -200,8 +201,9 @@ task TrainVariantAnnotationsModel {
 
         String gatk_docker
         File? gatk_override
-	}
-	Int disk_size = ceil(size(annots, "GB") + 100)
+
+        RuntimeAttributes runtime_attributes = {}
+    }
 
     command {
         set -e
@@ -254,6 +256,7 @@ task ScoreVariantAnnotations {
 
         String gatk_docker
         File? gatk_override
+
         RuntimeAttributes runtime_attributes = {}
     }
 
@@ -304,5 +307,4 @@ task ScoreVariantAnnotations {
         File? scores_hdf5 = "~{output_prefix}.score.scores.hdf5"        # this file will only be produced if the number of sites scored is nonzero
         File? monitoring_log = "monitoring.log"
     }
-  }
-}
+}
\ No newline at end of file

From ad5189a5bb81bea94ec3ee8b03d1675624c250e7 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 15:45:35 -0400
Subject: [PATCH 11/25] Refactoring to move all VQSR Classic code to separate
 WDL.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   | 194 ++---------------
 scripts/variantstore/wdl/GvsVQSRClassic.wdl   | 203 ++++++++++++++++++
 2 files changed, 218 insertions(+), 179 deletions(-)
 create mode 100644 scripts/variantstore/wdl/GvsVQSRClassic.wdl

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index b5965a75b71..5fb5cb6ad1a 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -1,7 +1,7 @@
 version 1.0
 
-import "GvsWarpTasks.wdl" as Tasks
 import "GvsUtils.wdl" as Utils
+import "GvsVQSRClassic.wdl" as VQSRClassic
 import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite
 
 workflow GvsCreateFilterSet {
@@ -12,66 +12,13 @@ workflow GvsCreateFilterSet {
     String call_set_identifier
 
     String filter_set_name
-    Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
 
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File? gatk_override
 
     Boolean use_classic_VQSR = true
-    Int? INDEL_VQSR_max_gaussians_override = 4
-    Int? INDEL_VQSR_maximum_training_variants
-    Int? INDEL_VQSR_mem_gb_override
-    Int? SNP_VQSR_max_gaussians_override = 6
-    Int? SNP_VQSR_mem_gb_override
-    Int? SNP_VQSR_sample_every_nth_variant
-    Int? SNP_VQSR_maximum_training_variants
-    # This is the minimum number of samples where the SNP model will be created and applied in separate tasks
-    # (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered)
-    # For WARP classic this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least
-    # with the default VM memory settings) so this was adjusted down to 5K.
-    Int snps_variant_recalibration_threshold = 5000
   }
 
-  Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
-
-  # reference files
-  # Axiom - Used only for indels
-  # Classic: known=false,training=true,truth=false
-  # Lite:    training=true,calibration=false
-  File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
-  File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
-
-  # DbSNP - BOTH SNPs and INDELs. But used only as known in classic (which isn't used in Lite and so dropped in lite)
-  # Classic: known=true,training=false,truth=false
-  # Lite:    Unused
-  File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf"
-  File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx"
-
-  # HapMap - SNPs
-  # Classic: known=false,training=true,truth=true
-  # Lite:    training=true,calibration=true
-  File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
-  File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"
-
-  # Mills - Indels
-  # Classic: known=false,training=true,truth=true
-  # Lite:    training=true,calibration=true
-  File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
-  File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
-
-  # Omni - SNPs
-  # Classic: known=false,training=true,truth=true
-  # Lite:    training=true,calibration=true
-  File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
-  File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"
-
-  # 1000G - SNPs
-  # Classic: known=false,training=true,truth=false
-  # Lite:    training=true,calibration=false
-  File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
-  File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"
-
   File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
   File reference_dict = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict"
   File reference_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
@@ -226,129 +173,24 @@ workflow GvsCreateFilterSet {
   }
 
   if (use_classic_VQSR) {
-
-    call Tasks.IndelsVariantRecalibrator {
+    call VQSRClassic.JointVcfFiltering as VQSRClassic {
       input:
+        base_name = filter_set_name,
+        num_samples_loaded = GetNumSamplesLoaded.num_samples,
         sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
-        sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
-        recalibration_filename = filter_set_name + ".indels.recal",
-        tranches_filename = filter_set_name + ".indels.tranches",
-        recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"],
-        recalibration_annotation_values = indel_recalibration_annotation_values,
-        mills_resource_vcf = mills_resource_vcf,
-        mills_resource_vcf_index = mills_resource_vcf_index,
-        axiomPoly_resource_vcf = axiomPoly_resource_vcf,
-        axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
-        dbsnp_resource_vcf = dbsnp_vcf,
-        dbsnp_resource_vcf_index = dbsnp_vcf_index,
-        use_allele_specific_annotations = true,
-        disk_size = "1000",
-        machine_mem_gb = INDEL_VQSR_mem_gb_override,
-        max_gaussians = INDEL_VQSR_max_gaussians_override,
-        maximum_training_variants = INDEL_VQSR_maximum_training_variants,
-    }
-
-    if (GetNumSamplesLoaded.num_samples > snps_variant_recalibration_threshold) {
-      call Tasks.SNPsVariantRecalibratorCreateModel {
-        input:
-          sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
-          sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
-          recalibration_filename = filter_set_name + ".snps.recal",
-          tranches_filename = filter_set_name + ".snps.tranches",
-          recalibration_tranche_values = snp_recalibration_tranche_values,
-          recalibration_annotation_values = snp_recalibration_annotation_values,
-          model_report_filename = filter_set_name + ".snps.model.report",
-          hapmap_resource_vcf = hapmap_resource_vcf,
-          hapmap_resource_vcf_index = hapmap_resource_vcf_index,
-          omni_resource_vcf = omni_resource_vcf,
-          omni_resource_vcf_index = omni_resource_vcf_index,
-          one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
-          one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
-          dbsnp_resource_vcf = dbsnp_vcf,
-          dbsnp_resource_vcf_index = dbsnp_vcf_index,
-          use_allele_specific_annotations = true,
-          disk_size = "1000",
-          machine_mem_gb = SNP_VQSR_mem_gb_override,
-          max_gaussians = SNP_VQSR_max_gaussians_override,
-          sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant,
-          maximum_training_variants = SNP_VQSR_maximum_training_variants
-      }
-
-      scatter (idx in range(length(ExtractFilterTask.output_vcf))) {
-        call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
-          input:
-            sites_only_variant_filtered_vcf = ExtractFilterTask.output_vcf[idx],
-            sites_only_variant_filtered_vcf_index = ExtractFilterTask.output_vcf_index[idx],
-            recalibration_filename = filter_set_name + ".snps." + idx + ".recal",
-            tranches_filename = filter_set_name + ".snps." + idx + ".tranches",
-            recalibration_tranche_values = snp_recalibration_tranche_values,
-            recalibration_annotation_values = snp_recalibration_annotation_values,
-            model_report = SNPsVariantRecalibratorCreateModel.model_report,
-            hapmap_resource_vcf = hapmap_resource_vcf,
-            hapmap_resource_vcf_index = hapmap_resource_vcf_index,
-            omni_resource_vcf = omni_resource_vcf,
-            omni_resource_vcf_index = omni_resource_vcf_index,
-            one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
-            one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
-            dbsnp_resource_vcf = dbsnp_vcf,
-            dbsnp_resource_vcf_index = dbsnp_vcf_index,
-            use_allele_specific_annotations = true,
-            disk_size = "1000",
-            machine_mem_gb = SNP_VQSR_mem_gb_override
-        }
-      }
-
-      call Tasks.GatherTranches as SNPGatherTranches {
-        input:
-          tranches = SNPsVariantRecalibratorScattered.tranches,
-          output_filename = filter_set_name + ".snps.gathered.tranches",
-          output_tranche_values = snp_recalibration_tranche_values,
-          mode = "SNP",
-          disk_size = "200",
-          gatk_override = gatk_override
-      }
-
-      call Utils.MergeVCFs as MergeRecalibrationFiles {
-        input:
-          input_vcfs = SNPsVariantRecalibratorScattered.recalibration,
-          gather_type = "CONVENTIONAL",
-          output_vcf_name = "${filter_set_name}.vrecalibration.gz",
-          preemptible_tries = 3,
-      }
-    }
-
-    if (GetNumSamplesLoaded.num_samples <= snps_variant_recalibration_threshold) {
-      call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
-        input:
-          sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
-          sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
-          recalibration_filename = filter_set_name + ".snps.recal",
-          tranches_filename = filter_set_name + ".snps.tranches",
-          recalibration_tranche_values = snp_recalibration_tranche_values,
-          recalibration_annotation_values = snp_recalibration_annotation_values,
-          hapmap_resource_vcf = hapmap_resource_vcf,
-          hapmap_resource_vcf_index = hapmap_resource_vcf_index,
-          omni_resource_vcf = omni_resource_vcf,
-          omni_resource_vcf_index = omni_resource_vcf_index,
-          one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
-          one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
-          dbsnp_resource_vcf = dbsnp_vcf,
-          dbsnp_resource_vcf_index = dbsnp_vcf_index,
-          use_allele_specific_annotations = true,
-          disk_size = "1000",
-          machine_mem_gb = SNP_VQSR_mem_gb_override,
-          max_gaussians = SNP_VQSR_max_gaussians_override,
-      }
+        sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index,
+        sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf,
+        sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index,
     }
 
     call PopulateFilterSetInfo as PopulateFilterSetInfoClassic {
       input:
         gatk_override = gatk_override,
         filter_set_name = filter_set_name,
-        snp_recal_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]),
-        snp_recal_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]),
-        indel_recal_file = IndelsVariantRecalibrator.recalibration,
-        indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index,
+        snp_recal_file = VQSRClassic.snps_variant_recalibration_file,
+        snp_recal_file_index = VQSRClassic.snps_variant_recalibration_file_index,
+        indel_recal_file = VQSRClassic.indels_variant_recalibration_file,
+        indel_recal_file_index = VQSRClassic.indels_variant_recalibration_file_index,
         fq_info_destination_table = fq_info_destination_table,
         filter_schema = fq_info_destination_table_schema,
         project_id = project_id,
@@ -369,8 +211,8 @@ workflow GvsCreateFilterSet {
       input:
         gatk_override = gatk_override,
         filter_set_name = filter_set_name,
-        snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]),
-        indel_recal_tranches = IndelsVariantRecalibrator.tranches,
+        snp_recal_tranches = VQSRClassic.snps_variant_tranches_file,
+        indel_recal_tranches = VQSRClassic.indels_variant_tranches_file,
         fq_tranches_destination_table = fq_tranches_destination_table,
         project_id = project_id
     }
@@ -387,19 +229,13 @@ workflow GvsCreateFilterSet {
                [AltAlleleTableDatetimeCheck.monitoring_log],
                ExtractFilterTask.monitoring_log,
                [MergeVCFs.monitoring_log],
-               select_first([JointVcfFiltering.monitoring_logs, []]),             # VQSR Lite Logging starts here
+               select_first([JointVcfFiltering.monitoring_logs, []]),
                [MergeScoredVCFs.monitoring_log],
                [CreateFilteredScoredSNPsVCF.monitoring_log],
                [CreateFilteredScoredINDELsVCF.monitoring_log],
                [PopulateFilterSetInfo.monitoring_log],
                [PopulateFilterSetSites.monitoring_log],
-               [IndelsVariantRecalibrator.monitoring_log],    # VQSR Classic Logging Starts here
-               [SNPsVariantRecalibratorCreateModel.monitoring_log],
-               select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]),
-               [SNPGatherTranches.monitoring_log],
-               [MergeRecalibrationFiles.monitoring_log],
-               [IndelsVariantRecalibrator.monitoring_log],
-               [SNPsVariantRecalibratorClassic.monitoring_log],
+               select_first([VQSRClassic.monitoring_logs, []]),
                [PopulateFilterSetInfoClassic.monitoring_log],
                [PopulateFilterSetSitesClassic.monitoring_log],
                [PopulateFilterSetTranches.monitoring_log]
diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
new file mode 100644
index 00000000000..de93b2e0571
--- /dev/null
+++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
@@ -0,0 +1,203 @@
+version 1.0
+
+import "GvsWarpTasks.wdl" as Tasks
+import "GvsUtils.wdl" as Utils
+
+workflow JointVcfFiltering {
+  input {
+    String base_name
+    Int num_samples_loaded
+    File sites_only_variant_filtered_vcf
+    File sites_only_variant_filtered_vcf_idx
+    Array[File] sites_only_variant_filtered_vcfs
+    Array[File] sites_only_variant_filtered_vcf_idxs
+
+    Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+    Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
+    File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
+    File? gatk_override
+
+    Int? INDEL_VQSR_max_gaussians_override = 4
+    Int? INDEL_VQSR_maximum_training_variants
+    Int? INDEL_VQSR_mem_gb_override
+    Int? SNP_VQSR_max_gaussians_override = 6
+    Int? SNP_VQSR_mem_gb_override
+    Int? SNP_VQSR_sample_every_nth_variant
+    Int? SNP_VQSR_maximum_training_variants
+
+    # Sample-count threshold: when num_samples_loaded exceeds this value the SNP model is created and applied in
+    # separate tasks (SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered); otherwise the single
+    # SNPsVariantRecalibratorClassic task is used. For VQSR classic this is done with 20k but the 10K Stroke Anderson
+    # dataset would not work unscattered (at least with the default VM memory settings) so this was adjusted down to 5K.
+    Int snps_variant_recalibration_threshold = 5000
+  }
+
+  Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
+
+  # reference files
+  # Axiom - Used only for indels
+  # Classic: known=false,training=true,truth=false
+  File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
+  File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
+
+  # DbSNP - BOTH SNPs and INDELs. Used only as a 'known' resource in Classic (Lite does not use it, so it is dropped there)
+  # Classic: known=true,training=false,truth=false
+  File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf"
+  File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx"
+
+  # HapMap - SNPs
+  # Classic: known=false,training=true,truth=true
+  File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
+  File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"
+
+  # Mills - Indels
+  # Classic: known=false,training=true,truth=true
+  File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
+  File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
+
+  # Omni - SNPs
+  # Classic: known=false,training=true,truth=true
+  File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
+  File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"
+
+  # 1000G - SNPs
+  # Classic: known=false,training=true,truth=false
+  File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
+  File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"
+
+  call Tasks.IndelsVariantRecalibrator {
+    input:
+      sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
+      sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
+      recalibration_filename = base_name + ".indels.recal",
+      tranches_filename = base_name + ".indels.tranches",
+      recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"],
+      recalibration_annotation_values = indel_recalibration_annotation_values,
+      mills_resource_vcf = mills_resource_vcf,
+      mills_resource_vcf_index = mills_resource_vcf_index,
+      axiomPoly_resource_vcf = axiomPoly_resource_vcf,
+      axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
+      dbsnp_resource_vcf = dbsnp_vcf,
+      dbsnp_resource_vcf_index = dbsnp_vcf_index,
+      use_allele_specific_annotations = true,
+      disk_size = "1000",
+      machine_mem_gb = INDEL_VQSR_mem_gb_override,
+      max_gaussians = INDEL_VQSR_max_gaussians_override,
+      maximum_training_variants = INDEL_VQSR_maximum_training_variants,
+  }
+
+  if (num_samples_loaded > snps_variant_recalibration_threshold) {
+    call Tasks.SNPsVariantRecalibratorCreateModel {
+      input:
+        sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
+        sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
+        recalibration_filename = base_name + ".snps.recal",
+        tranches_filename = base_name + ".snps.tranches",
+        model_report_filename = base_name + ".snps.model.report",
+        recalibration_tranche_values = snp_recalibration_tranche_values,
+        recalibration_annotation_values = snp_recalibration_annotation_values,
+        hapmap_resource_vcf = hapmap_resource_vcf,
+        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
+        omni_resource_vcf = omni_resource_vcf,
+        omni_resource_vcf_index = omni_resource_vcf_index,
+        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
+        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
+        dbsnp_resource_vcf = dbsnp_vcf,
+        dbsnp_resource_vcf_index = dbsnp_vcf_index,
+        use_allele_specific_annotations = true,
+        disk_size = "1000",
+        machine_mem_gb = SNP_VQSR_mem_gb_override,
+        max_gaussians = SNP_VQSR_max_gaussians_override,
+        sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant,
+        maximum_training_variants = SNP_VQSR_maximum_training_variants
+    }
+
+    scatter (idx in range(length(sites_only_variant_filtered_vcfs))) {
+      call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
+        input:
+          sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcfs[idx],
+          sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idxs[idx],
+          recalibration_filename = base_name + ".snps." + idx + ".recal",
+          tranches_filename = base_name + ".snps." + idx + ".tranches",
+          model_report = SNPsVariantRecalibratorCreateModel.model_report,
+          recalibration_tranche_values = snp_recalibration_tranche_values,
+          recalibration_annotation_values = snp_recalibration_annotation_values,
+          hapmap_resource_vcf = hapmap_resource_vcf,
+          hapmap_resource_vcf_index = hapmap_resource_vcf_index,
+          omni_resource_vcf = omni_resource_vcf,
+          omni_resource_vcf_index = omni_resource_vcf_index,
+          one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
+          one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
+          dbsnp_resource_vcf = dbsnp_vcf,
+          dbsnp_resource_vcf_index = dbsnp_vcf_index,
+          use_allele_specific_annotations = true,
+          disk_size = "1000",
+          machine_mem_gb = SNP_VQSR_mem_gb_override
+      }
+    }
+
+    call Tasks.GatherTranches as SNPGatherTranches {
+      input:
+        tranches = SNPsVariantRecalibratorScattered.tranches,
+        output_filename = base_name + ".snps.gathered.tranches",
+        output_tranche_values = snp_recalibration_tranche_values,
+        mode = "SNP",
+        disk_size = "200",
+        gatk_override = gatk_override
+    }
+
+    call Utils.MergeVCFs as MergeRecalibrationFiles {
+      input:
+        input_vcfs = SNPsVariantRecalibratorScattered.recalibration,
+        gather_type = "CONVENTIONAL",
+        output_vcf_name = "${base_name}.vrecalibration.vcf.gz",
+        preemptible_tries = 3,
+    }
+  }
+
+  if (num_samples_loaded <= snps_variant_recalibration_threshold) {
+    call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
+      input:
+        sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
+        sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
+        recalibration_filename = base_name + ".snps.recal",
+        tranches_filename = base_name + ".snps.tranches",
+        recalibration_tranche_values = snp_recalibration_tranche_values,
+        recalibration_annotation_values = snp_recalibration_annotation_values,
+        hapmap_resource_vcf = hapmap_resource_vcf,
+        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
+        omni_resource_vcf = omni_resource_vcf,
+        omni_resource_vcf_index = omni_resource_vcf_index,
+        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
+        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
+        dbsnp_resource_vcf = dbsnp_vcf,
+        dbsnp_resource_vcf_index = dbsnp_vcf_index,
+        use_allele_specific_annotations = true,
+        disk_size = "1000",
+        machine_mem_gb = SNP_VQSR_mem_gb_override,
+        max_gaussians = SNP_VQSR_max_gaussians_override,
+    }
+  }
+
+  output {
+    File snps_variant_recalibration_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration])
+    File snps_variant_recalibration_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index])
+    File snps_variant_tranches_file = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches])
+    File indels_variant_recalibration_file = IndelsVariantRecalibrator.recalibration
+    File indels_variant_recalibration_file_index = IndelsVariantRecalibrator.recalibration_index
+    File indels_variant_tranches_file = IndelsVariantRecalibrator.tranches
+    Array[File?] monitoring_logs = flatten(
+                                   [
+                                   [IndelsVariantRecalibrator.monitoring_log],
+                                   [SNPsVariantRecalibratorCreateModel.monitoring_log],
+                                   select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]),
+                                   [SNPGatherTranches.monitoring_log],
+                                   [MergeRecalibrationFiles.monitoring_log],
+                                   [IndelsVariantRecalibrator.monitoring_log],
+                                   [SNPsVariantRecalibratorClassic.monitoring_log]
+                                   ])
+  }
+
+}
+

From fb1a90bd93bad1fc64d63a63938007913ff351d4 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 16:13:22 -0400
Subject: [PATCH 12/25] Allow to push some VQSR Classic parameters down from
 GvsUnified

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 5fb5cb6ad1a..6de47e1a1a9 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -17,6 +17,14 @@ workflow GvsCreateFilterSet {
     File? gatk_override
 
     Boolean use_classic_VQSR = true
+
+    Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+    Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
+    Int? INDEL_VQSR_max_gaussians_override = 4
+    Int? INDEL_VQSR_mem_gb_override
+    Int? SNP_VQSR_max_gaussians_override = 6
+    Int? SNP_VQSR_mem_gb_override
   }
 
   File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
@@ -181,6 +189,12 @@ workflow GvsCreateFilterSet {
         sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index,
         sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf,
         sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index,
+        snp_recalibration_annotation_values = snp_recalibration_annotation_values,
+        indel_recalibration_annotation_values = indel_recalibration_annotation_values,
+        INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
+        INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
+        SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
+        SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override
     }
 
     call PopulateFilterSetInfo as PopulateFilterSetInfoClassic {

From 12a8d7d3d7aac199dd1117fa6efea9e5055a2ec5 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 19:36:09 -0400
Subject: [PATCH 13/25] Comment out for debugging

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 6de47e1a1a9..529493ef098 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -249,7 +249,7 @@ workflow GvsCreateFilterSet {
                [CreateFilteredScoredINDELsVCF.monitoring_log],
                [PopulateFilterSetInfo.monitoring_log],
                [PopulateFilterSetSites.monitoring_log],
-               select_first([VQSRClassic.monitoring_logs, []]),
+#               select_first([VQSRClassic.monitoring_logs, []]),
                [PopulateFilterSetInfoClassic.monitoring_log],
                [PopulateFilterSetSitesClassic.monitoring_log],
                [PopulateFilterSetTranches.monitoring_log]

From b95340cb2eec927ba1bd5a23c8da5c2b4fc4fd76 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 19:38:00 -0400
Subject: [PATCH 14/25] Actually, try it like this.

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 529493ef098..e59de5b834e 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -249,7 +249,7 @@ workflow GvsCreateFilterSet {
                [CreateFilteredScoredINDELsVCF.monitoring_log],
                [PopulateFilterSetInfo.monitoring_log],
                [PopulateFilterSetSites.monitoring_log],
-#               select_first([VQSRClassic.monitoring_logs, []]),
+               VQSRClassic.monitoring_logs,
                [PopulateFilterSetInfoClassic.monitoring_log],
                [PopulateFilterSetSitesClassic.monitoring_log],
                [PopulateFilterSetTranches.monitoring_log]

From bd542c39706c6d96433f38a152cb841d741f4860 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 19:40:50 -0400
Subject: [PATCH 15/25] No, actually not

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index e59de5b834e..529493ef098 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -249,7 +249,7 @@ workflow GvsCreateFilterSet {
                [CreateFilteredScoredINDELsVCF.monitoring_log],
                [PopulateFilterSetInfo.monitoring_log],
                [PopulateFilterSetSites.monitoring_log],
-               VQSRClassic.monitoring_logs,
+#               select_first([VQSRClassic.monitoring_logs, []]),
                [PopulateFilterSetInfoClassic.monitoring_log],
                [PopulateFilterSetSitesClassic.monitoring_log],
                [PopulateFilterSetTranches.monitoring_log]

From fa0f0ee543df5705ca6a17ce07ca01c53cf92ea6 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 19:58:06 -0400
Subject: [PATCH 16/25] More debugging, one fix.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   | 24 +++++++++----------
 scripts/variantstore/wdl/GvsVQSRClassic.wdl   |  1 -
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 529493ef098..6bdbd29cbd4 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -238,20 +238,20 @@ workflow GvsCreateFilterSet {
                flatten(
                [
                [SamplesTableDatetimeCheck.monitoring_log],
-               [GetNumSamplesLoaded.monitoring_log],
-               [SplitIntervals.monitoring_log],
-               [AltAlleleTableDatetimeCheck.monitoring_log],
+#               [GetNumSamplesLoaded.monitoring_log],
+#               [SplitIntervals.monitoring_log],
+#               [AltAlleleTableDatetimeCheck.monitoring_log],
                ExtractFilterTask.monitoring_log,
-               [MergeVCFs.monitoring_log],
+#               [MergeVCFs.monitoring_log],
                select_first([JointVcfFiltering.monitoring_logs, []]),
-               [MergeScoredVCFs.monitoring_log],
-               [CreateFilteredScoredSNPsVCF.monitoring_log],
-               [CreateFilteredScoredINDELsVCF.monitoring_log],
-               [PopulateFilterSetInfo.monitoring_log],
-               [PopulateFilterSetSites.monitoring_log],
-#               select_first([VQSRClassic.monitoring_logs, []]),
-               [PopulateFilterSetInfoClassic.monitoring_log],
-               [PopulateFilterSetSitesClassic.monitoring_log],
+#               [MergeScoredVCFs.monitoring_log],
+#               [CreateFilteredScoredSNPsVCF.monitoring_log],
+#               [CreateFilteredScoredINDELsVCF.monitoring_log],
+#               [PopulateFilterSetInfo.monitoring_log],
+#               [PopulateFilterSetSites.monitoring_log],
+               select_first([VQSRClassic.monitoring_logs, []]),
+#               [PopulateFilterSetInfoClassic.monitoring_log],
+#               [PopulateFilterSetSitesClassic.monitoring_log],
                [PopulateFilterSetTranches.monitoring_log]
                ]
                )
diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
index de93b2e0571..efdcf045a6b 100644
--- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl
+++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
@@ -194,7 +194,6 @@ workflow JointVcfFiltering {
                                    select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]),
                                    [SNPGatherTranches.monitoring_log],
                                    [MergeRecalibrationFiles.monitoring_log],
-                                   [IndelsVariantRecalibrator.monitoring_log],
                                    [SNPsVariantRecalibratorClassic.monitoring_log]
                                    ])
   }

From dde74e7ca86cde196ba39300dd1ebbb74d076c91 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Tue, 18 Apr 2023 22:00:51 -0400
Subject: [PATCH 17/25] Add a select_all

---
 scripts/variantstore/wdl/GvsVQSRClassic.wdl                | 5 +++--
 scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
index efdcf045a6b..158bbec8faa 100644
--- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl
+++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
@@ -187,7 +187,8 @@ workflow JointVcfFiltering {
     File indels_variant_recalibration_file = IndelsVariantRecalibrator.recalibration
     File indels_variant_recalibration_file_index = IndelsVariantRecalibrator.recalibration_index
     File indels_variant_tranches_file = IndelsVariantRecalibrator.tranches
-    Array[File?] monitoring_logs = flatten(
+    Array[File] monitoring_logs = select_all(
+                                  flatten(
                                    [
                                    [IndelsVariantRecalibrator.monitoring_log],
                                    [SNPsVariantRecalibratorCreateModel.monitoring_log],
@@ -195,7 +196,7 @@ workflow JointVcfFiltering {
                                    [SNPGatherTranches.monitoring_log],
                                    [MergeRecalibrationFiles.monitoring_log],
                                    [SNPsVariantRecalibratorClassic.monitoring_log]
-                                   ])
+                                   ]))
   }
 
 }
diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
index 9ffd5c63b99..55b5fa1c390 100644
--- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
+++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -184,7 +184,7 @@ task ExtractVariantAnnotations {
         File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5"
         File extracted_vcf = "~{output_prefix}.extract.vcf.gz"          # this line will break if extra_args includes the do-not-gzip-vcf-output argument
         File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi"  # this line will break if extra_args includes the do-not-gzip-vcf-output argument
-        File monitoring_log = "monitoring.log"
+        File? monitoring_log = "monitoring.log"
     }
 }
 

From 8137b7bd158efb79419b5d198a9a04f6612d9639 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 19 Apr 2023 08:00:38 -0400
Subject: [PATCH 18/25] Okay - put all the logs back for final test of classic.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 6bdbd29cbd4..6de47e1a1a9 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -238,20 +238,20 @@ workflow GvsCreateFilterSet {
                flatten(
                [
                [SamplesTableDatetimeCheck.monitoring_log],
-#               [GetNumSamplesLoaded.monitoring_log],
-#               [SplitIntervals.monitoring_log],
-#               [AltAlleleTableDatetimeCheck.monitoring_log],
+               [GetNumSamplesLoaded.monitoring_log],
+               [SplitIntervals.monitoring_log],
+               [AltAlleleTableDatetimeCheck.monitoring_log],
                ExtractFilterTask.monitoring_log,
-#               [MergeVCFs.monitoring_log],
+               [MergeVCFs.monitoring_log],
                select_first([JointVcfFiltering.monitoring_logs, []]),
-#               [MergeScoredVCFs.monitoring_log],
-#               [CreateFilteredScoredSNPsVCF.monitoring_log],
-#               [CreateFilteredScoredINDELsVCF.monitoring_log],
-#               [PopulateFilterSetInfo.monitoring_log],
-#               [PopulateFilterSetSites.monitoring_log],
+               [MergeScoredVCFs.monitoring_log],
+               [CreateFilteredScoredSNPsVCF.monitoring_log],
+               [CreateFilteredScoredINDELsVCF.monitoring_log],
+               [PopulateFilterSetInfo.monitoring_log],
+               [PopulateFilterSetSites.monitoring_log],
                select_first([VQSRClassic.monitoring_logs, []]),
-#               [PopulateFilterSetInfoClassic.monitoring_log],
-#               [PopulateFilterSetSitesClassic.monitoring_log],
+               [PopulateFilterSetInfoClassic.monitoring_log],
+               [PopulateFilterSetSitesClassic.monitoring_log],
                [PopulateFilterSetTranches.monitoring_log]
                ]
                )

From 8e45dd6a4a6c02a67626c71dcd69c97ccd893c8a Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 19 Apr 2023 16:09:49 -0400
Subject: [PATCH 19/25] Update .dockstore.yml for testing VQSR Classic
 Refactoring.

---
 .dockstore.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.dockstore.yml b/.dockstore.yml
index aea4b176091..5896f106add 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -224,6 +224,7 @@ workflows:
          - master
          - ah_var_store
          - vs_866_update_variants_base_image
+         - gg_VS-776_UpdateToLatestVQSRLite
    - name: GvsQuickstartHailIntegration
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl

From 99f956d08c2e954761a1815f8a7eeff5394e2c4f Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Wed, 19 Apr 2023 16:14:55 -0400
Subject: [PATCH 20/25] Fix the .dockstore.yml

---
 .dockstore.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.dockstore.yml b/.dockstore.yml
index 5896f106add..9586879f7ad 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -224,7 +224,6 @@ workflows:
          - master
          - ah_var_store
          - vs_866_update_variants_base_image
-         - gg_VS-776_UpdateToLatestVQSRLite
    - name: GvsQuickstartHailIntegration
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
@@ -241,6 +240,7 @@ workflows:
          - master
          - ah_var_store
          - vs_888_fix_broken_gsutil_docker
+         - gg_VS-776_UpdateToLatestVQSRLite
    - name: GvsIngestTieout
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl

From 3f7234e2515ae5d7bed2910e4da903ed077354a3 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Thu, 20 Apr 2023 16:20:44 -0400
Subject: [PATCH 21/25] Renamed some VQSR Classic Specific inputs. Added test
 files.

---
 .../variantstore/wdl/GvsCreateFilterSet.wdl   | 32 +++++++++++--------
 scripts/variantstore/wdl/GvsUnified.wdl       | 29 ++++++++++-------
 scripts/variantstore/wdl/GvsVQSRClassic.wdl   | 15 +++++----
 .../test_10_samples.22.avg.vcf.gz             |  3 --
 .../test_10_samples.22.avg.vcf.gz.tbi         |  3 --
 .../test_10_samples.23.avg.vcf.gz             |  3 --
 .../test_10_samples.23.avg.vcf.gz.tbi         |  3 --
 .../test_10_samples.chr21.avg.vcf.gz          |  3 ++
 .../test_10_samples.chr21.avg.vcf.gz.tbi      |  3 ++
 ...t_10_samples.chr21_chr22.sites_only.vcf.gz |  3 ++
 ..._samples.chr21_chr22.sites_only.vcf.gz.tbi |  3 ++
 .../test_10_samples.chr22.avg.vcf.gz          |  3 ++
 .../test_10_samples.chr22.avg.vcf.gz.tbi      |  3 ++
 .../test_10_samples.empty.avg.vcf.gz          |  3 ++
 .../test_10_samples.empty.avg.vcf.gz.tbi      |  3 ++
 .../test_10_samples.sites_only.vcf.gz         |  3 --
 .../test_10_samples.sites_only.vcf.gz.tbi     |  3 --
 17 files changed, 67 insertions(+), 51 deletions(-)
 delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz
 delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi
 delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz
 delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz
 create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi
 delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz
 delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 6de47e1a1a9..4775a5501c9 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -17,14 +17,17 @@ workflow GvsCreateFilterSet {
     File? gatk_override
 
     Boolean use_classic_VQSR = true
+    # These are the SNP and INDEL annotations used for VQSR Classic, the order matters for consistency between runs.
+    Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+    Array[String] vqsr_classic_snp_recalibration_annotations   = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
 
-    Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+    Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
+    Int? INDEL_VQSR_CLASSIC_mem_gb_override
+    Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
+    Int? SNP_VQSR_CLASSIC_mem_gb_override
 
-    Int? INDEL_VQSR_max_gaussians_override = 4
-    Int? INDEL_VQSR_mem_gb_override
-    Int? SNP_VQSR_max_gaussians_override = 6
-    Int? SNP_VQSR_mem_gb_override
+    # These are the (unified) annotations used for VQSR Lite. The order matters for consistency between runs.
+    Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
   }
 
   File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
@@ -113,14 +116,15 @@ workflow GvsCreateFilterSet {
         sites_only_vcf = MergeVCFs.output_vcf,
         sites_only_vcf_idx = MergeVCFs.output_vcf_index,
         output_prefix = filter_set_name,
-        gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0",
-        annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
+        annotations = vqsr_lite_recalibration_annotations,
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         extract_runtime_attributes = {"command_mem_gb": 27},
         train_runtime_attributes = {"command_mem_gb": 27},
         score_runtime_attributes = {"command_mem_gb": 15},
+        gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0",
+        gatk_override = gatk_override,
         monitoring_script = "gs://gvs-internal/cromwell_monitoring_script.sh"
     }
 
@@ -189,12 +193,12 @@ workflow GvsCreateFilterSet {
         sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index,
         sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf,
         sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index,
-        snp_recalibration_annotation_values = snp_recalibration_annotation_values,
-        indel_recalibration_annotation_values = indel_recalibration_annotation_values,
-        INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
-        INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
-        SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
-        SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override
+        snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations,
+        indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations,
+        INDEL_VQSR_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
+        INDEL_VQSR_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
+        SNP_VQSR_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
+        SNP_VQSR_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override
     }
 
     call PopulateFilterSetInfo as PopulateFilterSetInfoClassic {
diff --git a/scripts/variantstore/wdl/GvsUnified.wdl b/scripts/variantstore/wdl/GvsUnified.wdl
index 80bfdd1afef..880cc302201 100644
--- a/scripts/variantstore/wdl/GvsUnified.wdl
+++ b/scripts/variantstore/wdl/GvsUnified.wdl
@@ -39,13 +39,18 @@ workflow GvsUnified {
 
         # Begin GvsCreateFilterSet
         String filter_set_name = call_set_identifier
-        Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-        Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
 
-        Int? INDEL_VQSR_max_gaussians_override = 4
-        Int? INDEL_VQSR_mem_gb_override
-        Int? SNP_VQSR_max_gaussians_override = 6
-        Int? SNP_VQSR_mem_gb_override
+        # These are the SNP and INDEL annotations used for VQSR Classic, the order matters.
+        Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+        Array[String] vqsr_classic_snp_recalibration_annotations   = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
+        # These are the (unified) annotations used for VQSR Lite. The order matters.
+        Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
+        Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
+        Int? INDEL_VQSR_CLASSIC_mem_gb_override
+        Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
+        Int? SNP_VQSR_CLASSIC_mem_gb_override
         # End GvsCreateFilterSet
 
         # Begin GvsPrepareRangesCallset
@@ -116,14 +121,14 @@ workflow GvsUnified {
             project_id = project_id,
             call_set_identifier = call_set_identifier,
             filter_set_name = filter_set_name,
-            indel_recalibration_annotation_values = indel_recalibration_annotation_values,
-            snp_recalibration_annotation_values = snp_recalibration_annotation_values,
+            vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations,
+            vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations,
             interval_list = interval_list,
             gatk_override = gatk_override,
-            INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
-            INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
-            SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
-            SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override
+            INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
+            INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
+            SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
+            SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override
     }
 
     call PrepareRangesCallset.GvsPrepareCallset {
diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
index 158bbec8faa..181b1eaf8ce 100644
--- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl
+++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
@@ -12,8 +12,8 @@ workflow JointVcfFiltering {
     Array[File] sites_only_variant_filtered_vcfs
     Array[File] sites_only_variant_filtered_vcf_idxs
 
-    Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+    Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+    Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
 
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File? gatk_override
@@ -34,6 +34,7 @@ workflow JointVcfFiltering {
   }
 
   Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
+  Array[String] indel_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"]
 
   # reference files
   # Axiom - Used only for indels
@@ -72,8 +73,8 @@ workflow JointVcfFiltering {
       sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
       recalibration_filename = base_name + ".indels.recal",
       tranches_filename = base_name + ".indels.tranches",
-      recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"],
-      recalibration_annotation_values = indel_recalibration_annotation_values,
+      recalibration_tranche_values = indel_recalibration_tranche_values,
+      recalibration_annotation_values = indel_recalibration_annotations,
       mills_resource_vcf = mills_resource_vcf,
       mills_resource_vcf_index = mills_resource_vcf_index,
       axiomPoly_resource_vcf = axiomPoly_resource_vcf,
@@ -96,7 +97,7 @@ workflow JointVcfFiltering {
         tranches_filename = base_name + ".snps.tranches",
         model_report_filename = base_name + ".snps.model.report",
         recalibration_tranche_values = snp_recalibration_tranche_values,
-        recalibration_annotation_values = snp_recalibration_annotation_values,
+        recalibration_annotation_values = snp_recalibration_annotations,
         hapmap_resource_vcf = hapmap_resource_vcf,
         hapmap_resource_vcf_index = hapmap_resource_vcf_index,
         omni_resource_vcf = omni_resource_vcf,
@@ -122,7 +123,7 @@ workflow JointVcfFiltering {
           tranches_filename = base_name + ".snps." + idx + ".tranches",
           model_report = SNPsVariantRecalibratorCreateModel.model_report,
           recalibration_tranche_values = snp_recalibration_tranche_values,
-          recalibration_annotation_values = snp_recalibration_annotation_values,
+          recalibration_annotation_values = snp_recalibration_annotations,
           hapmap_resource_vcf = hapmap_resource_vcf,
           hapmap_resource_vcf_index = hapmap_resource_vcf_index,
           omni_resource_vcf = omni_resource_vcf,
@@ -164,7 +165,7 @@ workflow JointVcfFiltering {
         recalibration_filename = base_name + ".snps.recal",
         tranches_filename = base_name + ".snps.tranches",
         recalibration_tranche_values = snp_recalibration_tranche_values,
-        recalibration_annotation_values = snp_recalibration_annotation_values,
+        recalibration_annotation_values = snp_recalibration_annotations,
         hapmap_resource_vcf = hapmap_resource_vcf,
         hapmap_resource_vcf_index = hapmap_resource_vcf_index,
         omni_resource_vcf = omni_resource_vcf,
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz
deleted file mode 100644
index 31cba1e00f8..00000000000
--- a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dcf1dbda2255fbe1372d09d364835452d610822070b6b9b56b1733388aa3cd19
-size 140900871
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi
deleted file mode 100644
index 5fd47681849..00000000000
--- a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af32939cd4f63a0a9251a50cc5658738285d4cee4833bcf1cda6b92d90c4b99b
-size 100153
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz
deleted file mode 100644
index 55dde2493e4..00000000000
--- a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4144805bd8fabc74f3eea39a910dbd5c24017b844c44640efda49e3b0febe693
-size 112076612
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi
deleted file mode 100644
index 114d43936c5..00000000000
--- a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f3484a38abb76952b02863099c383eae26d50f44514c5045992f63cc3294ebe8
-size 114295
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz
new file mode 100644
index 00000000000..304adc48127
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8556461fa8933187bea3708d72e55927a67ff9d73938f5cc26bf33c54cd58e2a
+size 34193100
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi
new file mode 100644
index 00000000000..80332910b23
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:320f43cc7cbee56a7cc06fbd4686041d970b496d205a3e6a7665b2a82f0214ed
+size 30439
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz
new file mode 100644
index 00000000000..b74468a5c83
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bef1de9d95dd0d336e4285f6e97b5db274ff5bd980a229ba4e2b64f9b2e3e50
+size 46372366
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi
new file mode 100644
index 00000000000..2abbb9be100
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0e67ffa564076f4dc501c6d0d92825aaffb2c0327bdfc1db31d02c392b6c664
+size 57950
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz
new file mode 100644
index 00000000000..4d52121291a
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa01d2fdb8558701b87e661cd6b191392db29ee2fc29a5eeb7eb565fba1448c8
+size 34534658
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi
new file mode 100644
index 00000000000..87c3471681f
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f110fb2bb242433375b44ecd2c756846574881e857ae803b89b0f7138949a3bd
+size 30502
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz
new file mode 100644
index 00000000000..026244d096e
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea51baa7a90a550ff4ae363728a21de88ef78aa5b6ebb6b7807aa9bd93e04459
+size 27578
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi
new file mode 100644
index 00000000000..0fc3d144274
--- /dev/null
+++ b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c9e74e847b0ca3ca72ab2bcc803ac43efaff23fa701271af7d9208df054c08e
+size 72
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz
deleted file mode 100644
index f75a07bd09c..00000000000
--- a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:00212a6387eba259a2d060eef08f50f3de512a155ed4e746d38530310a582e14
-size 134260565
diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi
deleted file mode 100644
index 475b5ba83a0..00000000000
--- a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c99412c88d072d494e545f56acdf621f6c960cbb8f2d734532cf9d5d11e83104
-size 133485

From 662605b41b2f2ac4463472300a855360dfd7407f Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Thu, 20 Apr 2023 16:37:16 -0400
Subject: [PATCH 22/25] Missed a dependency

---
 .../wdl/GvsJointVariantCalling.wdl            | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
index 32c8688cf1c..a25c5de4989 100644
--- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
+++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -30,16 +30,19 @@ workflow GvsJointVariantCalling {
       File sample_names_to_extract = ""
       Int split_intervals_disk_size_override = ""
       Int split_intervals_mem_override = ""
-      Int INDEL_VQSR_max_gaussians_override = 4
-      Int INDEL_VQSR_mem_gb_override = ""
-      Int SNP_VQSR_max_gaussians_override = 6
-      Int SNP_VQSR_mem_gb_override = ""
+      Int INDEL_VQSR_CLASSIC_max_gaussians_override = 4
+      Int INDEL_VQSR_CLASSIC_mem_gb_override = ""
+      Int SNP_VQSR_CLASSIC_max_gaussians_override = 6
+      Int SNP_VQSR_CLASSIC_mem_gb_override = ""
     }
     # This is the most updated snapshot of the code as of Feb 10, 2023
     File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-654-g4a1c203-SNAPSHOT-local.jar"
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
-    Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
+    # These are the SNP and INDEL annotations used for VQSR Classic, the order matters.
+    Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+    Array[String] vqsr_classic_snp_recalibration_annotations   = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
     File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
     # do we ever want non-beta customers to use this instead of using GvsUnified directly?  If so, we can make this an
     # argument that just defaults to true
@@ -65,7 +68,7 @@ workflow GvsJointVariantCalling {
             extract_table_prefix = extract_table_prefix,
             fq_temp_table_dataset = "~{project_id}.~{dataset_name}",
             gatk_override = gatk_override,
-            indel_recalibration_annotation_values = indel_recalibration_annotation_values,
+            vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations,
             interval_list = interval_list,
             interval_weights_bed = interval_weights_bed,
             load_data_batch_size = load_data_batch_size,
@@ -74,13 +77,13 @@ workflow GvsJointVariantCalling {
             query_labels = query_labels,
             query_project = project_id,
             sample_names_to_extract = sample_names_to_extract,
-            snp_recalibration_annotation_values = snp_recalibration_annotation_values,
+            vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations,
             split_intervals_disk_size_override = split_intervals_disk_size_override,
             split_intervals_mem_override = split_intervals_mem_override,
-            INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
-            INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
-            SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
-            SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override,
+            INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
+            INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
+            SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
+            SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override,
             drop_state = drop_state,
             is_beta_user = is_beta_user,
     }

From bf9eea8b031452d4969ae39e01f4b29a605c57c1 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Fri, 21 Apr 2023 09:57:26 -0400
Subject: [PATCH 23/25] Addressing code review comments.

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl    | 12 +++---------
 .../variantstore/wdl/GvsJointVariantCalling.wdl    |  6 ------
 scripts/variantstore/wdl/GvsUnified.wdl            |  9 ---------
 scripts/variantstore/wdl/GvsVQSRClassic.wdl        | 14 ++++----------
 4 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index 4775a5501c9..a50a97494fc 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -17,17 +17,13 @@ workflow GvsCreateFilterSet {
     File? gatk_override
 
     Boolean use_classic_VQSR = true
-    # These are the SNP and INDEL annotations used for VQSR Classic, the order matters for consistency between runs.
-    Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] vqsr_classic_snp_recalibration_annotations   = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
 
     Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
     Int? INDEL_VQSR_CLASSIC_mem_gb_override
     Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
     Int? SNP_VQSR_CLASSIC_mem_gb_override
 
-    # These are the (unified) annotations used for VQSR Lite. The order matters for consistency between runs.
-    Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+    RuntimeAttributes? vqsr_lite_extract_runtime_attributes = {"command_mem_gb": 27}
   }
 
   File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
@@ -116,11 +112,11 @@ workflow GvsCreateFilterSet {
         sites_only_vcf = MergeVCFs.output_vcf,
         sites_only_vcf_idx = MergeVCFs.output_vcf_index,
         output_prefix = filter_set_name,
-        annotations = vqsr_lite_recalibration_annotations,
+        annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
         resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
-        extract_runtime_attributes = {"command_mem_gb": 27},
+        extract_runtime_attributes = vqsr_lite_extract_runtime_attributes,
         train_runtime_attributes = {"command_mem_gb": 27},
         score_runtime_attributes = {"command_mem_gb": 15},
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0",
@@ -193,8 +189,6 @@ workflow GvsCreateFilterSet {
         sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index,
         sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf,
         sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index,
-        snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations,
-        indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations,
         INDEL_VQSR_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
         INDEL_VQSR_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
         SNP_VQSR_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
index a25c5de4989..eef57f7e1e6 100644
--- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
+++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -39,10 +39,6 @@ workflow GvsJointVariantCalling {
     File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-654-g4a1c203-SNAPSHOT-local.jar"
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
 
-    # These are the SNP and INDEL annotations used for VQSR Classic, the order matters.
-    Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] vqsr_classic_snp_recalibration_annotations   = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
-
     File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
     # do we ever want non-beta customers to use this instead of using GvsUnified directly?  If so, we can make this an
     # argument that just defaults to true
@@ -68,7 +64,6 @@ workflow GvsJointVariantCalling {
             extract_table_prefix = extract_table_prefix,
             fq_temp_table_dataset = "~{project_id}.~{dataset_name}",
             gatk_override = gatk_override,
-            vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations,
             interval_list = interval_list,
             interval_weights_bed = interval_weights_bed,
             load_data_batch_size = load_data_batch_size,
@@ -77,7 +72,6 @@ workflow GvsJointVariantCalling {
             query_labels = query_labels,
             query_project = project_id,
             sample_names_to_extract = sample_names_to_extract,
-            vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations,
             split_intervals_disk_size_override = split_intervals_disk_size_override,
             split_intervals_mem_override = split_intervals_mem_override,
             INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
diff --git a/scripts/variantstore/wdl/GvsUnified.wdl b/scripts/variantstore/wdl/GvsUnified.wdl
index 880cc302201..88d561b3cf3 100644
--- a/scripts/variantstore/wdl/GvsUnified.wdl
+++ b/scripts/variantstore/wdl/GvsUnified.wdl
@@ -40,13 +40,6 @@ workflow GvsUnified {
         # Begin GvsCreateFilterSet
         String filter_set_name = call_set_identifier
 
-        # These are the SNP and INDEL annotations used for VQSR Classic, the order matters.
-        Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-        Array[String] vqsr_classic_snp_recalibration_annotations   = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
-
-        # These are the (unified) annotations used for VQSR Lite. The order matters.
-        Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
-
         Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
         Int? INDEL_VQSR_CLASSIC_mem_gb_override
         Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
@@ -121,8 +114,6 @@ workflow GvsUnified {
             project_id = project_id,
             call_set_identifier = call_set_identifier,
             filter_set_name = filter_set_name,
-            vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations,
-            vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations,
             interval_list = interval_list,
             gatk_override = gatk_override,
             INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
index 181b1eaf8ce..4683d422823 100644
--- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl
+++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl
@@ -12,9 +12,6 @@ workflow JointVcfFiltering {
     Array[File] sites_only_variant_filtered_vcfs
     Array[File] sites_only_variant_filtered_vcf_idxs
 
-    Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
-    Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
-
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File? gatk_override
 
@@ -33,37 +30,34 @@ workflow JointVcfFiltering {
     Int snps_variant_recalibration_threshold = 5000
   }
 
+  Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
+  Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]
+
   Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
   Array[String] indel_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"]
 
   # reference files
   # Axiom - Used only for indels
-  # Classic: known=false,training=true,truth=false
   File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
   File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
 
-  # DbSNP - BOTH SNPs and INDELs. But used only as known in classic (which isn't used in Lite and so dropped in lite)
-  # Classic: known=true,training=false,truth=false
+  # DbSNP - BOTH SNPs and INDELs.
   File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf"
   File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx"
 
   # HapMap - SNPs
-  # Classic: known=false,training=true,truth=true
   File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
   File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"
 
   # Mills - Indels
-  # Classic: known=false,training=true,truth=true
   File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
   File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
 
   # Omni - SNPs
-  # Classic: known=false,training=true,truth=true
   File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
   File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"
 
   # 1000G - SNPs
-  # Classic: known=false,training=true,truth=false
   File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
   File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"
 

From 841152192bef1078bfe155db13d6630e5c9010da Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Fri, 21 Apr 2023 12:03:32 -0400
Subject: [PATCH 24/25] Pass runtime attributes to VQSR Lite WDL.

---
 scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
index a50a97494fc..9dcb37a8fef 100644
--- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
+++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -24,6 +24,8 @@ workflow GvsCreateFilterSet {
     Int? SNP_VQSR_CLASSIC_mem_gb_override
 
     RuntimeAttributes? vqsr_lite_extract_runtime_attributes = {"command_mem_gb": 27}
+    RuntimeAttributes? vqsr_lite_train_runtime_attributes = {"command_mem_gb": 27}
+    RuntimeAttributes? vqsr_lite_score_runtime_attributes = {"command_mem_gb": 15}
   }
 
   File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
@@ -117,8 +119,8 @@ workflow GvsCreateFilterSet {
         extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         score_extra_args = "-L ${interval_list} --use-allele-specific-annotations",
         extract_runtime_attributes = vqsr_lite_extract_runtime_attributes,
-        train_runtime_attributes = {"command_mem_gb": 27},
-        score_runtime_attributes = {"command_mem_gb": 15},
+        train_runtime_attributes = vqsr_lite_train_runtime_attributes,
+        score_runtime_attributes = vqsr_lite_score_runtime_attributes,
         gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0",
         gatk_override = gatk_override,
         monitoring_script = "gs://gvs-internal/cromwell_monitoring_script.sh"

From e276681494c07f0e28a22c49a7fde165eab844f3 Mon Sep 17 00:00:00 2001
From: gbggrant <ggrant@broadinstitute.org>
Date: Fri, 21 Apr 2023 13:50:09 -0400
Subject: [PATCH 25/25] Remove branches from .dockstore.yml

---
 .dockstore.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.dockstore.yml b/.dockstore.yml
index 9586879f7ad..037ea056c57 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -95,7 +95,6 @@ workflows:
        branches:
          - master
          - ah_var_store
-         - gg_VS-776_UpdateToLatestVQSRLite
    - name: GvsPopulateAltAllele
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
@@ -240,7 +239,6 @@ workflows:
          - master
          - ah_var_store
          - vs_888_fix_broken_gsutil_docker
-         - gg_VS-776_UpdateToLatestVQSRLite
    - name: GvsIngestTieout
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl