Skip to content

Commit

Permalink
VS-776. Update to latest version of VQSR Lite. (#8269)
Browse files Browse the repository at this point in the history
* Updated to latest version of VQSR Lite (from Master)
* Ported tests and files for VQSR Lite over
* Refactored VQSR Classic code into its own WDL
  • Loading branch information
gbggrant authored Apr 21, 2023
1 parent 17afee4 commit f4a355c
Show file tree
Hide file tree
Showing 24 changed files with 636 additions and 572 deletions.
231 changes: 49 additions & 182 deletions scripts/variantstore/wdl/GvsCreateFilterSet.wdl

Large diffs are not rendered by default.

21 changes: 9 additions & 12 deletions scripts/variantstore/wdl/GvsJointVariantCalling.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,15 @@ workflow GvsJointVariantCalling {
File sample_names_to_extract = ""
Int split_intervals_disk_size_override = ""
Int split_intervals_mem_override = ""
Int INDEL_VQSR_max_gaussians_override = 4
Int INDEL_VQSR_mem_gb_override = ""
Int SNP_VQSR_max_gaussians_override = 6
Int SNP_VQSR_mem_gb_override = ""
Int INDEL_VQSR_CLASSIC_max_gaussians_override = 4
Int INDEL_VQSR_CLASSIC_mem_gb_override = ""
Int SNP_VQSR_CLASSIC_max_gaussians_override = 6
Int SNP_VQSR_CLASSIC_mem_gb_override = ""
}
# This is the most updated snapshot of the code as of Feb 10, 2023
File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-654-g4a1c203-SNAPSHOT-local.jar"
File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
# do we ever want non-beta customers to use this instead of using GvsUnified directly? If so, we can make this an
# argument that just defaults to true
Expand All @@ -65,7 +64,6 @@ workflow GvsJointVariantCalling {
extract_table_prefix = extract_table_prefix,
fq_temp_table_dataset = "~{project_id}.~{dataset_name}",
gatk_override = gatk_override,
indel_recalibration_annotation_values = indel_recalibration_annotation_values,
interval_list = interval_list,
interval_weights_bed = interval_weights_bed,
load_data_batch_size = load_data_batch_size,
Expand All @@ -74,13 +72,12 @@ workflow GvsJointVariantCalling {
query_labels = query_labels,
query_project = project_id,
sample_names_to_extract = sample_names_to_extract,
snp_recalibration_annotation_values = snp_recalibration_annotation_values,
split_intervals_disk_size_override = split_intervals_disk_size_override,
split_intervals_mem_override = split_intervals_mem_override,
INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override,
INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override,
drop_state = drop_state,
is_beta_user = is_beta_user,
}
Expand Down
20 changes: 8 additions & 12 deletions scripts/variantstore/wdl/GvsUnified.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,11 @@ workflow GvsUnified {
# Begin GvsCreateFilterSet
String filter_set_name = call_set_identifier
Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

Int? INDEL_VQSR_max_gaussians_override = 4
Int? INDEL_VQSR_mem_gb_override
Int? SNP_VQSR_max_gaussians_override = 6
Int? SNP_VQSR_mem_gb_override
Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
Int? INDEL_VQSR_CLASSIC_mem_gb_override
Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
Int? SNP_VQSR_CLASSIC_mem_gb_override
# End GvsCreateFilterSet
# Begin GvsPrepareRangesCallset
Expand Down Expand Up @@ -116,14 +114,12 @@ workflow GvsUnified {
project_id = project_id,
call_set_identifier = call_set_identifier,
filter_set_name = filter_set_name,
indel_recalibration_annotation_values = indel_recalibration_annotation_values,
snp_recalibration_annotation_values = snp_recalibration_annotation_values,
interval_list = interval_list,
gatk_override = gatk_override,
INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override
INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override
}

call PrepareRangesCallset.GvsPrepareCallset {
Expand Down
198 changes: 198 additions & 0 deletions scripts/variantstore/wdl/GvsVQSRClassic.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
version 1.0

# GvsVQSRClassic: builds "VQSR Classic" site-level filtering models for a GVS callset.
# Produces VQSR recalibration VCFs and tranche files for both INDELs and SNPs from
# sites-only VCF input. For large callsets the SNP model is trained once and then
# applied across scattered shards; for small callsets it is trained and applied in
# a single unscattered task. Task implementations live in the imported WDLs.
import "GvsWarpTasks.wdl" as Tasks
import "GvsUtils.wdl" as Utils

workflow JointVcfFiltering {
input {
# Prefix used to name all output recalibration/tranche files.
String base_name
# Number of samples in the callset; drives the scattered-vs-unscattered SNP path below.
Int num_samples_loaded
# Merged sites-only VCF (+ index) covering the whole callset; used for INDEL
# recalibration and for the unscattered / model-creation SNP paths.
File sites_only_variant_filtered_vcf
File sites_only_variant_filtered_vcf_idx
# Per-shard sites-only VCFs (+ indexes), parallel arrays indexed together;
# used only on the scattered SNP path.
Array[File] sites_only_variant_filtered_vcfs
Array[File] sites_only_variant_filtered_vcf_idxs

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
# Optional GATK jar override passed through to tasks that accept it.
File? gatk_override

# VQSR tuning knobs, forwarded to the recalibrator tasks.
Int? INDEL_VQSR_max_gaussians_override = 4
Int? INDEL_VQSR_maximum_training_variants
Int? INDEL_VQSR_mem_gb_override
Int? SNP_VQSR_max_gaussians_override = 6
Int? SNP_VQSR_mem_gb_override
Int? SNP_VQSR_sample_every_nth_variant
Int? SNP_VQSR_maximum_training_variants

# This is the minimum number of samples where the SNP model will be created and applied in separate tasks
# (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered)
# For VQSR classic this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least
# with the default VM memory settings) so this was adjusted down to 5K.
Int snps_variant_recalibration_threshold = 5000
}

# Allele-specific annotations used to train the INDEL and SNP models.
Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

# Truth-sensitivity tranche cut points reported for each model.
Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
Array[String] indel_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"]

# reference files
# Axiom - Used only for indels
File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"

# DbSNP - BOTH SNPs and INDELs.
File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf"
File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx"

# HapMap - SNPs
File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"

# Mills - Indels
File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"

# Omni - SNPs
File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"

# 1000G - SNPs
File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"

# INDEL recalibration is always run unscattered over the merged sites-only VCF.
call Tasks.IndelsVariantRecalibrator {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
recalibration_filename = base_name + ".indels.recal",
tranches_filename = base_name + ".indels.tranches",
recalibration_tranche_values = indel_recalibration_tranche_values,
recalibration_annotation_values = indel_recalibration_annotations,
mills_resource_vcf = mills_resource_vcf,
mills_resource_vcf_index = mills_resource_vcf_index,
axiomPoly_resource_vcf = axiomPoly_resource_vcf,
axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = INDEL_VQSR_mem_gb_override,
max_gaussians = INDEL_VQSR_max_gaussians_override,
maximum_training_variants = INDEL_VQSR_maximum_training_variants,
}

# Large-callset SNP path: train the model once on the merged VCF, then apply it
# per shard and gather the per-shard tranches/recalibration results.
if (num_samples_loaded > snps_variant_recalibration_threshold) {
call Tasks.SNPsVariantRecalibratorCreateModel {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
recalibration_filename = base_name + ".snps.recal",
tranches_filename = base_name + ".snps.tranches",
model_report_filename = base_name + ".snps.model.report",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotations,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = SNP_VQSR_mem_gb_override,
max_gaussians = SNP_VQSR_max_gaussians_override,
sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant,
maximum_training_variants = SNP_VQSR_maximum_training_variants
}

# Apply the trained model to each sites-only VCF shard independently.
scatter (idx in range(length(sites_only_variant_filtered_vcfs))) {
call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcfs[idx],
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idxs[idx],
recalibration_filename = base_name + ".snps." + idx + ".recal",
tranches_filename = base_name + ".snps." + idx + ".tranches",
model_report = SNPsVariantRecalibratorCreateModel.model_report,
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotations,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = SNP_VQSR_mem_gb_override
}
}

# Combine the per-shard tranche files into a single SNP tranches file.
call Tasks.GatherTranches as SNPGatherTranches {
input:
tranches = SNPsVariantRecalibratorScattered.tranches,
output_filename = base_name + ".snps.gathered.tranches",
output_tranche_values = snp_recalibration_tranche_values,
mode = "SNP",
disk_size = "200",
gatk_override = gatk_override
}

# Merge the per-shard recalibration VCFs into one recalibration file.
call Utils.MergeVCFs as MergeRecalibrationFiles {
input:
input_vcfs = SNPsVariantRecalibratorScattered.recalibration,
gather_type = "CONVENTIONAL",
output_vcf_name = "${base_name}.vrecalibration.vcf.gz",
preemptible_tries = 3,
}
}

# Small-callset SNP path: train and apply the SNP model in one unscattered task.
if (num_samples_loaded <= snps_variant_recalibration_threshold) {
call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
input:
sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf,
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx,
recalibration_filename = base_name + ".snps.recal",
tranches_filename = base_name + ".snps.tranches",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotations,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_vcf,
dbsnp_resource_vcf_index = dbsnp_vcf_index,
use_allele_specific_annotations = true,
disk_size = "1000",
machine_mem_gb = SNP_VQSR_mem_gb_override,
max_gaussians = SNP_VQSR_max_gaussians_override,
}
}

output {
# Exactly one of the two SNP branches above ran, so each select_first resolves
# to the scattered (merged/gathered) result or the unscattered classic result.
File snps_variant_recalibration_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration])
File snps_variant_recalibration_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index])
File snps_variant_tranches_file = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches])
File indels_variant_recalibration_file = IndelsVariantRecalibrator.recalibration
File indels_variant_recalibration_file_index = IndelsVariantRecalibrator.recalibration_index
File indels_variant_tranches_file = IndelsVariantRecalibrator.tranches
# Collect whichever monitoring logs exist; select_all drops outputs from the
# branch that did not execute.
Array[File] monitoring_logs = select_all(
flatten(
[
[IndelsVariantRecalibrator.monitoring_log],
[SNPsVariantRecalibratorCreateModel.monitoring_log],
select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]),
[SNPGatherTranches.monitoring_log],
[MergeRecalibrationFiles.monitoring_log],
[SNPsVariantRecalibratorClassic.monitoring_log]
]))
}

}

2 changes: 1 addition & 1 deletion scripts/vcf_site_level_filtering_cromwell_tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

**This directory is for GATK devs only**

This directory contains scripts for running Variant Site Level WDL tests in the automated travis build environment.
This directory contains scripts for running Variant Site Level WDL tests in the automated build environment.

Please note that this only tests whether the WDL will complete successfully.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@ set -e
script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
cd "$script_path"

WORKING_DIR=/home/runner/work/gatk
WORKING_DIR=/home/runner/work/gatk/gatk
WDL_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_wdl
CROMWELL_TEST_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_cromwell_tests

set -e
echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..."

#assume Dockerfile is in root
echo "Building docker without running unit tests... ========="
cd $WORKING_DIR/gatk
cd $WORKING_DIR

# IMPORTANT: This code is duplicated in the cnv and M2 WDL test.
if [ ! -z "$CI_PULL_REQUEST" ]; then
Expand All @@ -21,18 +23,17 @@ if [ ! -z "$CI_PULL_REQUEST" ]; then
else
HASH_TO_USE=${CI_COMMIT}
sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
echo "using travis commit:"$HASH_TO_USE
echo "using commit:"$HASH_TO_USE
fi
echo "Docker build done =========="

cd $WORKING_DIR/gatk/scripts/
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json
echo "JSON FILES (modified) ======="
cat $WORKING_DIR/vcf_site_level_filtering_travis.json
echo "=================="

sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

echo "Running Filtering WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/
java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json

cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
Loading

0 comments on commit f4a355c

Please sign in to comment.