From 1a2b3de9ba7f223b44b14facbd08ed1478e9f680 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 29 Mar 2023 11:36:16 -0400 Subject: [PATCH 01/25] First cut - wanna run wdl validation. --- .../variantstore/wdl/GvsCreateFilterSet.wdl | 10 +- .../JointVcfFiltering.wdl | 551 +++++++++--------- 2 files changed, 272 insertions(+), 289 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 8a78b0db858..3bc0a5b2cef 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -129,16 +129,16 @@ workflow GvsCreateFilterSet { if (!use_classic_VQSR) { call VQSRLite.JointVcfFiltering as JointVcfFiltering { input: - vcf = ExtractFilterTask.output_vcf, - vcf_index = ExtractFilterTask.output_vcf_index, + input_vcfs = ExtractFilterTask.output_vcf, + input_vcf_idxs = ExtractFilterTask.output_vcf_index, sites_only_vcf = MergeVCFs.output_vcf, sites_only_vcf_index = MergeVCFs.output_vcf_index, - basename = filter_set_name, + output_prefix = filter_set_name, gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", extract_interval_list = interval_list, score_interval_list = interval_list, - snp_annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR", - indel_annotations = "-A AS_FS -A AS_ReadPosRankSum -A AS_MQRankSum -A AS_QD -A AS_SOR", + annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR", + resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true 
gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", use_allele_specific_annotations = true, } diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl index ccd80df201f..b7c4fddae73 100644 --- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -1,297 +1,280 @@ version 1.0 -# This is a workflow for filtering a joint callset VCF using INFO level annotations (so filtering is at the site level). -# Note that the input VCFs here may be sharded by genomic position which may be helpful for large cohorts. The script -# will output the same number of shards that are input. -# This portion of the filtering pipeline will assign a SCORE INFO field annotation to each site, but does not yet apply -# the filtering threshold to the final VCF. +# Workflow for scoring and optionally filtering a VCF based on site-level annotations using the +# ExtractVariantAnnotations-TrainVariantAnnotationsModel-ScoreVariantAnnotations toolchain, +# which supersedes the corresponding VariantRecalibrator-ApplyVQSR toolchain. +# See the parameter_meta section below for descriptions of the workflow inputs. +# Also see the GATK documentation for these tools for descriptions of the corresponding methods and additional details. + +struct RuntimeAttributes { + Int? cpu + Int? command_mem_gb + Int? additional_mem_gb + Int? disk_size_gb + Int? boot_disk_size_gb + Boolean? use_ssd + Int? preemptible + Int? max_retries +} workflow JointVcfFiltering { - input { - Array[File] vcf - Array[File] vcf_index - File sites_only_vcf - File sites_only_vcf_index - String basename - - String? model_backend - File? training_python_script - File? 
scoring_python_script - File? hyperparameters_json - - String gatk_docker - File? extract_interval_list - File? score_interval_list - - String snp_annotations - String indel_annotations - File? gatk_override - - Boolean use_allele_specific_annotations - - String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" - String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" - } - - parameter_meta { - vcf: "An array of input VCFs that are one callset sharded by genomic region." - sites_only_vcf: "The full VCF callset without any genotype or sample level information." - basename: "Desired output file basename." 
- } - - call ExtractVariantAnnotations as ExtractVariantAnnotationsSNPs { - input: - input_vcf = sites_only_vcf, - input_vcf_index = sites_only_vcf_index, - mode = "SNP", - annotations = snp_annotations, - resource_args = snp_resource_args, - basename = basename, - interval_list = extract_interval_list, - use_allele_specific_annotations = use_allele_specific_annotations, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call ExtractVariantAnnotations as ExtractVariantAnnotationsINDELs { - input: - input_vcf = sites_only_vcf, - input_vcf_index = sites_only_vcf_index, - mode = "INDEL", - annotations = indel_annotations, - resource_args = indel_resource_args, - basename = basename, - interval_list = extract_interval_list, - use_allele_specific_annotations = use_allele_specific_annotations, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call TrainVariantAnnotationModel as TrainVariantAnnotationModelSNPs { - input: - annots = ExtractVariantAnnotationsSNPs.annots, - basename = basename, - mode = "snp", - model_backend = model_backend, - python_script = training_python_script, - hyperparameters_json = hyperparameters_json, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call TrainVariantAnnotationModel as TrainVariantAnnotationModelINDELs { - input: - annots = ExtractVariantAnnotationsINDELs.annots, - basename = basename, - mode = "indel", - model_backend = model_backend, - python_script = training_python_script, - hyperparameters_json = hyperparameters_json, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - scatter(idx in range(length(vcf))) { - call ScoreVariantAnnotations as ScoreVariantAnnotationsSNPs { - input: - vcf = vcf[idx], - vcf_index = vcf_index[idx], - basename = basename, - mode = "SNP", - model_backend = model_backend, - python_script = scoring_python_script, - annotations = snp_annotations, - extracted_training_vcf = ExtractVariantAnnotationsSNPs.extracted_training_vcf, - 
extracted_training_vcf_index = ExtractVariantAnnotationsSNPs.extracted_training_vcf_index, - interval_list = score_interval_list, - model_files = TrainVariantAnnotationModelSNPs.outputs, - resource_args = snp_resource_args, - use_allele_specific_annotations = use_allele_specific_annotations, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call ScoreVariantAnnotations as ScoreVariantAnnotationsINDELs { - input: - vcf = vcf[idx], - vcf_index = vcf_index[idx], - basename = basename, - mode = "INDEL", - model_backend = model_backend, - python_script = scoring_python_script, - annotations = indel_annotations, - extracted_training_vcf = ExtractVariantAnnotationsINDELs.extracted_training_vcf, - extracted_training_vcf_index = ExtractVariantAnnotationsINDELs.extracted_training_vcf_index, - interval_list = score_interval_list, - model_files = TrainVariantAnnotationModelINDELs.outputs, - resource_args = indel_resource_args, - use_allele_specific_annotations = use_allele_specific_annotations, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - } - - output { - Array[File] indels_variant_scored_vcf = ScoreVariantAnnotationsINDELs.output_vcf - Array[File] indels_variant_scored_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index - Array[File] snps_variant_scored_vcf = ScoreVariantAnnotationsSNPs.output_vcf - Array[File] snps_variant_scored_vcf_index = ScoreVariantAnnotationsSNPs.output_vcf_index - } - + input { + Array[File] input_vcfs + Array[File] input_vcf_idxs + File sites_only_vcf + File sites_only_vcf_idx + String output_prefix + + Array[String] annotations + String resource_args + + String? model_backend + File? python_script + File? hyperparameters_json + + String? extract_extra_args + String? train_extra_args + String? score_extra_args + + String gatk_docker + File? gatk_override + + RuntimeAttributes? extract_runtime_attributes + RuntimeAttributes? train_runtime_attributes + RuntimeAttributes? 
score_runtime_attributes + } + + parameter_meta { + input_vcfs: "Sharded input VCFs to be scored and optionally filtered." + sites_only_vcf: "A concatenated, sites-only version of the sharded input VCFs; used for extracting training and calibration sets." + output_prefix: "Base prefix for output files. Sharded output VCFs will be named following the pattern \"{output_prefix}.{zero_based_shard_index}.score.vcf.gz\"." + annotations: "Annotations to be used for extraction, training, and scoring." + resource_args: "Resource arguments to be used for extraction and scoring. For example, \"--resource:training_and_calibration_set,training=true,calibration=true gs://path-to-training-and-calibration-set ...\".\n See GATK documentation for the ExtractVariantAnnotations and ScoreVariantAnnotations tools." + model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool." + python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool." + hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool." + extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool." + train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool." + score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. 
This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool." + } + + call ExtractVariantAnnotations { + input: + input_vcf = sites_only_vcf, + input_vcf_idx = sites_only_vcf_idx, + output_prefix = output_prefix, + annotations = annotations, + resource_args = resource_args, + extra_args = extract_extra_args, + gatk_docker = gatk_docker, + gatk_override = gatk_override, + runtime_attributes = extract_runtime_attributes + } + + call TrainVariantAnnotationsModel { + input: + annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5, + unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5, + model_backend = model_backend, + python_script = python_script, + hyperparameters_json = hyperparameters_json, + output_prefix = output_prefix, + extra_args = train_extra_args, + gatk_docker = gatk_docker, + gatk_override = gatk_override, + runtime_attributes = train_runtime_attributes + } + + scatter (i in range(length(input_vcfs))) { + call ScoreVariantAnnotations { + input: + input_vcf = input_vcfs[i], + input_vcf_idx = input_vcf_idxs[i], + output_prefix = "~{output_prefix}.~{i}", + annotations = annotations, + resource_args = resource_args, + extracted_vcf = ExtractVariantAnnotations.extracted_vcf, + extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx, + model_prefix = output_prefix, + model_files = TrainVariantAnnotationsModel.model_files, + extra_args = score_extra_args, + gatk_docker = gatk_docker, + gatk_override = gatk_override, + runtime_attributes = score_runtime_attributes + } + } + + output { + File extracted_annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5 + File? 
extracted_unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5 + File extracted_vcf = ExtractVariantAnnotations.extracted_vcf + File extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx + + Array[File] model_files = TrainVariantAnnotationsModel.model_files + + Array[File] scored_vcfs = ScoreVariantAnnotations.scored_vcf + Array[File] scored_vcf_idxs = ScoreVariantAnnotations.scored_vcf_idx + Array[File?] annotations_hdf5s = ScoreVariantAnnotations.annotations_hdf5 + Array[File?] scores_hdf5s = ScoreVariantAnnotations.scores_hdf5 + } } task ExtractVariantAnnotations { - input { - String gatk_docker - File? gatk_override - File input_vcf - File input_vcf_index - String basename - String mode - String annotations - String resource_args - File? interval_list - Boolean use_allele_specific_annotations - - Int memory_mb = 28000 - Int command_mem = memory_mb - 1000 - } - Int disk_size = ceil(size(input_vcf, "GB") + size(input_vcf_index, "GB") + 100) - - command { - set -e - - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx~{command_mem}m" \ - ExtractVariantAnnotations \ - -V ~{input_vcf} \ - -O ~{basename}.~{mode} \ - ~{annotations} \ - ~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \ - ~{"-L " + interval_list} \ - --mode ~{mode} \ - ~{resource_args} - } - output { - File annots = "~{basename}.~{mode}.annot.hdf5" - File extracted_training_vcf = "~{basename}.~{mode}.vcf.gz" - File extracted_training_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi" - Array[File] outputs = glob("~{basename}.~{mode}.*") - } - runtime { - docker: gatk_docker - disks: "local-disk " + disk_size + " HDD" - memory: memory_mb + " MiB" - } + input { + File input_vcf + File input_vcf_idx + String output_prefix + Array[String] annotations + String resource_args + String? extra_args + + String gatk_docker + File? 
gatk_override + + RuntimeAttributes runtime_attributes = {} + } + + parameter_meta { + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + } + + command { + set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \ + ExtractVariantAnnotations \ + -V ~{input_vcf} \ + -O ~{output_prefix}.extract \ + -A ~{sep=" -A " annotations} \ + ~{resource_args} \ + ~{extra_args} + } + + runtime { + docker: gatk_docker + cpu: select_first([runtime_attributes.cpu, 1]) + memory: select_first([runtime_attributes.command_mem_gb, 6]) + select_first([runtime_attributes.additional_mem_gb, 1]) + " GB" + disks: "local-disk " + select_first([runtime_attributes.disk_size_gb, 100]) + if select_first([runtime_attributes.use_ssd, false]) then " SSD" else " HDD" + bootDiskSizeGb: select_first([runtime_attributes.boot_disk_size_gb, 15]) + preemptible: select_first([runtime_attributes.preemptible, 2]) + maxRetries: select_first([runtime_attributes.max_retries, 1]) + } + + output { + File annotations_hdf5 = "~{output_prefix}.extract.annot.hdf5" + File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5" + File extracted_vcf = "~{output_prefix}.extract.vcf.gz" # this line will break if extra_args includes the do-not-gzip-vcf-output argument + File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument + } } -task TrainVariantAnnotationModel { - input { - String gatk_docker - File? gatk_override - File annots - String basename - String mode - String? model_backend - File? python_script - File? 
hyperparameters_json - - Int memory_mb = 28000 - Int command_mem = memory_mb - 1000 - } - Int disk_size = ceil(size(annots, "GB") + 100) - - command <<< - set -e - - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - - mode=$(echo "~{mode}" | awk '{print toupper($0)}') - - gatk --java-options "-Xmx~{command_mem}m" \ - TrainVariantAnnotationsModel \ - --annotations-hdf5 ~{annots} \ - -O ~{basename} \ - ~{"--model-backend " + model_backend} \ - ~{"--python-script " + python_script} \ - ~{"--hyperparameters-json " + hyperparameters_json} \ - --mode $mode - - >>> - output { - Array[File] outputs = glob("~{basename}.~{mode}.*") - } - runtime { - docker: gatk_docker - disks: "local-disk " + disk_size + " HDD" - memory: memory_mb + " MiB" - } +task TrainVariantAnnotationsModel { + input { + File annotations_hdf5 + File? unlabeled_annotations_hdf5 + String? model_backend + File? python_script + File? hyperparameters_json + String output_prefix + String? extra_args + + String gatk_docker + File? 
gatk_override + + RuntimeAttributes runtime_attributes = {} + } + + command { + set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \ + TrainVariantAnnotationsModel \ + --annotations-hdf5 ~{annotations_hdf5} \ + ~{"--unlabeled-annotations-hdf5 " + unlabeled_annotations_hdf5} \ + ~{"--model-backend " + model_backend} \ + ~{"--python-script " + python_script} \ + ~{"--hyperparameters-json " + hyperparameters_json} \ + -O ~{output_prefix}.train \ + ~{extra_args} + } + + runtime { + docker: gatk_docker + cpu: select_first([runtime_attributes.cpu, 1]) + memory: select_first([runtime_attributes.command_mem_gb, 6]) + select_first([runtime_attributes.additional_mem_gb, 1]) + " GB" + disks: "local-disk " + select_first([runtime_attributes.disk_size_gb, 100]) + if select_first([runtime_attributes.use_ssd, false]) then " SSD" else " HDD" + bootDiskSizeGb: select_first([runtime_attributes.boot_disk_size_gb, 15]) + preemptible: select_first([runtime_attributes.preemptible, 2]) + maxRetries: select_first([runtime_attributes.max_retries, 1]) + } + + output { + Array[File] model_files = glob("~{output_prefix}.train.*") + } } task ScoreVariantAnnotations { - input { - String gatk_docker - File? gatk_override - File vcf - File vcf_index - String basename - String mode - String? model_backend - File? python_script - String annotations - String resource_args - File extracted_training_vcf - File extracted_training_vcf_index - File? interval_list - Array[File] model_files - Boolean use_allele_specific_annotations - - Int memory_mb = 16000 - Int command_mem = memory_mb - 1000 - } - Int disk_size = ceil(size(vcf, "GB") * 2 + 50) - - command { - zgrep -v '#' ~{vcf} > empty.txt - set -e - - if [ -s empty.txt ]; then - ln -s ~{sep=" . && ln -s " model_files} . 
- - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx~{command_mem}m" \ - ScoreVariantAnnotations \ - ~{"-L " + interval_list} \ - -V ~{vcf} \ - -O ~{basename}.~{mode} \ - ~{"--model-backend " + model_backend} \ - ~{"--python-script " + python_script} \ - --model-prefix ~{basename} \ - ~{annotations} \ - ~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \ - -mode ~{mode} \ - --resource:extracted,extracted=true ~{extracted_training_vcf} \ - ~{resource_args} - else - echo "Input VCF was empty so we'll return the same VCF that was input." - echo "Scores and annot hdf5 files will not be produced since the input was empty." - ln -s ~{vcf} ~{basename}.~{mode}.vcf.gz - ln -s ~{vcf_index} ~{basename}.~{mode}.vcf.gz.tbi - fi - } - output { - File? scores = "~{basename}.~{mode}.scores.hdf5" - File? annots = "~{basename}.~{mode}.annot.hdf5" - File output_vcf = "~{basename}.~{mode}.vcf.gz" - File output_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi" - } - runtime { - docker: gatk_docker - disks: "local-disk " + disk_size + " HDD" - memory: memory_mb + " MiB" - } + input { + File input_vcf + File input_vcf_idx + String output_prefix + Array[String] annotations + String resource_args + File extracted_vcf + File extracted_vcf_idx + String model_prefix + Array[File] model_files + String? extra_args + + String gatk_docker + File? 
gatk_override + + RuntimeAttributes runtime_attributes = {} + } + + parameter_meta { + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + extracted_vcf: {localization_optional: true} + extracted_vcf_idx: {localization_optional: true} + } + + command { + set -e + + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + mkdir model-files + ln -s ~{sep=" model-files && ln -s " model_files} model-files + + gatk --java-options "-Xmx~{default=2 runtime_attributes.command_mem_gb}G" \ + ScoreVariantAnnotations \ + -V ~{input_vcf} \ + -O ~{output_prefix}.score \ + -A ~{sep=" -A " annotations} \ + ~{resource_args} \ + --resource:extracted,extracted=true ~{extracted_vcf} \ + --model-prefix model-files/~{model_prefix}.train \ + ~{extra_args} + } + + runtime { + docker: gatk_docker + cpu: select_first([runtime_attributes.cpu, 1]) + memory: select_first([runtime_attributes.command_mem_gb, 2]) + select_first([runtime_attributes.additional_mem_gb, 1]) + " GB" + disks: "local-disk " + select_first([runtime_attributes.disk_size_gb, 100]) + if select_first([runtime_attributes.use_ssd, false]) then " SSD" else " HDD" + bootDiskSizeGb: select_first([runtime_attributes.boot_disk_size_gb, 15]) + preemptible: select_first([runtime_attributes.preemptible, 2]) + maxRetries: select_first([runtime_attributes.max_retries, 1]) + } + + output { + File scored_vcf = "~{output_prefix}.score.vcf.gz" # this line will break if extra_args includes the do-not-gzip-vcf-output argument + File scored_vcf_idx = "~{output_prefix}.score.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument + File? annotations_hdf5 = "~{output_prefix}.score.annot.hdf5" # this file will only be produced if the number of sites scored is nonzero + File? 
scores_hdf5 = "~{output_prefix}.score.scores.hdf5" # this file will only be produced if the number of sites scored is nonzero + } } - From 01151c6c5937da2dc302377c1f678d53fcaec21b Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 29 Mar 2023 13:10:23 -0400 Subject: [PATCH 02/25] Fixed the annotations, and I think I have the flag for allele-specific annotations right now. --- .dockstore.yml | 1 + scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index df45322584e..218fe684052 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -96,6 +96,7 @@ workflows: - master - ah_var_store - gg_VS-695_RunPandSForVQSR_Lite + - gg_VS-776_UpdateToLatestVQSRLite - name: GvsPopulateAltAllele subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 3bc0a5b2cef..546e3a93cf2 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -137,9 +137,10 @@ workflow GvsCreateFilterSet { gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", extract_interval_list = interval_list, score_interval_list = interval_list, - annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR", + annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true 
gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", - use_allele_specific_annotations = true, + extract_extra_args = "--use-allele-specific-annotations", + score_extra_args = "--use-allele-specific-annotations" } call Utils.MergeVCFs as MergeINDELScoredVCFs { From 3f5931820d38dd3989707d5272ef41c88689933e Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 29 Mar 2023 13:23:30 -0400 Subject: [PATCH 03/25] Pass the interval list right? Correct for the newly renamed output file. --- .../variantstore/wdl/GvsCreateFilterSet.wdl | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 546e3a93cf2..6bb7667be76 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -135,27 +135,17 @@ workflow GvsCreateFilterSet { sites_only_vcf_index = MergeVCFs.output_vcf_index, output_prefix = filter_set_name, gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", - extract_interval_list = interval_list, - score_interval_list = interval_list, annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 
--resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", - extract_extra_args = "--use-allele-specific-annotations", - score_extra_args = "--use-allele-specific-annotations" + extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", + score_extra_args = "-L ${interval_list} --use-allele-specific-annotations" } - call Utils.MergeVCFs as MergeINDELScoredVCFs { + call Utils.MergeVCFs as MergeScoredVCFs { input: - input_vcfs = JointVcfFiltering.indels_variant_scored_vcf, + input_vcfs = JointVcfFiltering.scored_vcfs, gather_type = "CONVENTIONAL", - output_vcf_name = "${filter_set_name}.indel.vrecalibration.gz", - preemptible_tries = 3, - } - - call Utils.MergeVCFs as MergeSNPScoredVCFs { - input: - input_vcfs = JointVcfFiltering.snps_variant_scored_vcf, - gather_type = "CONVENTIONAL", - output_vcf_name = "${filter_set_name}.snp.vrecalibration.gz", + output_vcf_name = "${filter_set_name}.vrecalibration.gz", preemptible_tries = 3, } @@ -166,8 +156,8 @@ workflow GvsCreateFilterSet { # which we don't want to put into the filter_set_info_vqsr_lite table. 
call Utils.SelectVariants as CreateFilteredScoredSNPsVCF { input: - input_vcf = MergeSNPScoredVCFs.output_vcf, - input_vcf_index = MergeSNPScoredVCFs.output_vcf_index, + input_vcf = MergeScoredVCFs.output_vcf, + input_vcf_index = MergeScoredVCFs.output_vcf_index, type_to_include = "SNP", exclude_filtered = true, output_basename = "${filter_set_name}.filtered.scored.snps" @@ -175,8 +165,8 @@ workflow GvsCreateFilterSet { call Utils.SelectVariants as CreateFilteredScoredINDELsVCF { input: - input_vcf = MergeINDELScoredVCFs.output_vcf, - input_vcf_index = MergeINDELScoredVCFs.output_vcf_index, + input_vcf = MergeScoredVCFs.output_vcf, + input_vcf_index = MergeScoredVCFs.output_vcf_index, type_to_include = "INDEL", exclude_filtered = true, output_basename = "${filter_set_name}.filtered.scored.indels" From a155bbeb2c062b08339755ebdd08109a4c9fe2df Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 29 Mar 2023 13:32:47 -0400 Subject: [PATCH 04/25] Fixed another bug. --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 6bb7667be76..e3b02cac119 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -132,7 +132,7 @@ workflow GvsCreateFilterSet { input_vcfs = ExtractFilterTask.output_vcf, input_vcf_idxs = ExtractFilterTask.output_vcf_index, sites_only_vcf = MergeVCFs.output_vcf, - sites_only_vcf_index = MergeVCFs.output_vcf_index, + sites_only_vcf_idx = MergeVCFs.output_vcf_index, output_prefix = filter_set_name, gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], From ee626966511ffde55df7e7331ffdadb99aad9c5b Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 29 Mar 2023 16:23:21 -0400 Subject: [PATCH 05/25] Manually copying over the vcf_site_level_filtering 
tests. --- .../README.md | 2 +- .../run_vcf_site_level_filtering_wdl.sh | 25 ++++++++++--------- .../vcf_site_level_filtering.json | 17 +++++++++++++ .../vcf_site_level_filtering_pos_neg.json | 19 ++++++++++++++ .../vcf_site_level_filtering_travis.json | 14 ----------- 5 files changed, 50 insertions(+), 27 deletions(-) create mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json create mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json delete mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/README.md b/scripts/vcf_site_level_filtering_cromwell_tests/README.md index 6f9950fa36d..28ee43d9b8c 100644 --- a/scripts/vcf_site_level_filtering_cromwell_tests/README.md +++ b/scripts/vcf_site_level_filtering_cromwell_tests/README.md @@ -2,7 +2,7 @@ **This directory is for GATK devs only** -This directory contains scripts for running Variant Site Level WDL tests in the automated travis build environment. +This directory contains scripts for running Variant Site Level WDL tests in the automated build environment. Please note that this only tests whether the WDL will complete successfully. 
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh index 1c19d18c3b6..1f5955aa1f1 100644 --- a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh +++ b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh @@ -4,14 +4,16 @@ set -e script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) cd "$script_path" -WORKING_DIR=/home/runner/work/gatk +WORKING_DIR=/home/runner/work/gatk/gatk +WDL_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_wdl +CROMWELL_TEST_DIR=$WORKING_DIR/scripts/vcf_site_level_filtering_cromwell_tests set -e echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..." #assume Dockerfile is in root echo "Building docker without running unit tests... =========" -cd $WORKING_DIR/gatk +cd $WORKING_DIR # IMPORTANT: This code is duplicated in the cnv and M2 WDL test. if [ ! -z "$CI_PULL_REQUEST" ]; then @@ -21,18 +23,17 @@ if [ ! 
-z "$CI_PULL_REQUEST" ]; then else HASH_TO_USE=${CI_COMMIT} sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/; - echo "using travis commit:"$HASH_TO_USE + echo "using commit:"$HASH_TO_USE fi echo "Docker build done ==========" -cd $WORKING_DIR/gatk/scripts/ -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json -echo "JSON FILES (modified) =======" -cat $WORKING_DIR/vcf_site_level_filtering_travis.json -echo "==================" - +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json echo "Running Filtering WDL through cromwell" -ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl -cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/ -java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json + +cat $WORKING_DIR/vcf_site_level_filtering_mod.json +java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json + +cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json +java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json \ No newline at end of file diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json new file mode 100644 index 00000000000..37cba35ad9d --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering.json @@ -0,0 +1,17 @@ +{ + "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__", + 
"JointVcfFiltering.input_vcfs": [ + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz"], + "JointVcfFiltering.input_vcf_idxs": [ + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi"], + "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz", + "JointVcfFiltering.sites_only_vcf_idx": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi", + "JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"], + "JointVcfFiltering.output_prefix": "test_10_samples", + "JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "JointVcfFiltering.extract_extra_args": "-L chr21" +} \ No newline at end of file diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json 
b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json new file mode 100644 index 00000000000..ee2d116e1d4 --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json @@ -0,0 +1,19 @@ +{ + "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__", + "JointVcfFiltering.input_vcfs": [ + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz"], + "JointVcfFiltering.input_vcf_idxs": [ + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi"], + "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz", + "JointVcfFiltering.sites_only_vcf_idx": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi", + "JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"], + "JointVcfFiltering.output_prefix": "test_10_samples", + "JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true 
gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "JointVcfFiltering.extract_extra_args": "-L chr21 --maximum-number-of-unlabeled-variants 10000000", + "JointVcfFiltering.train_extra_args": "--calibration-sensitivity-threshold 0.95", + "JointVcfFiltering.score_extra_args": "--snp-calibration-sensitivity-threshold 0.95 --indel-calibration-sensitivity-threshold 0.95" +} \ No newline at end of file diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json deleted file mode 100644 index 8165e199d22..00000000000 --- a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__", - "JointVcfFiltering.vcf": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz", - "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz"], - "JointVcfFiltering.vcf_index": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi", - "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi"], - "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz", - "JointVcfFiltering.sites_only_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi", - "JointVcfFiltering.basename": "test_10_samples", - "JointVcfFiltering.snp_annotations": "-A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", - "JointVcfFiltering.indel_annotations": "-A MQRankSum -A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE", - 
"JointVcfFiltering.model_backend": "PYTHON_IFOREST", - "JointVcfFiltering.use_allele_specific_annotations": false -} From 42fbd2db3d8e66f463c1cbd06c467e76910987ca Mon Sep 17 00:00:00 2001 From: gbggrant Date: Thu, 30 Mar 2023 08:21:59 -0400 Subject: [PATCH 06/25] Upping command memory for ExtractTask --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 3 ++- scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index a269c6782eb..f9da79feaa8 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -138,7 +138,8 @@ workflow GvsCreateFilterSet { annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", - score_extra_args = "-L ${interval_list} --use-allele-specific-annotations" + score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", + extract_runtime_attributes = {"command_mem_gb": 15} } call Utils.MergeVCFs as MergeScoredVCFs { diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl 
b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl index b7c4fddae73..cd73c8e1ef2 100644 --- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -139,10 +139,14 @@ task ExtractVariantAnnotations { input_vcf_idx: {localization_optional: true} } + File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" + command { set -e export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + bash ~{monitoring_script} > monitoring.log & + gatk --java-options "-Xmx~{default=6 runtime_attributes.command_mem_gb}G" \ ExtractVariantAnnotations \ -V ~{input_vcf} \ @@ -167,6 +171,7 @@ task ExtractVariantAnnotations { File? unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5" File extracted_vcf = "~{output_prefix}.extract.vcf.gz" # this line will break if extra_args includes the do-not-gzip-vcf-output argument File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument + File monitoring_log = "monitoring.log" } } From fc3eb3aa2d0a756cfc10c548d16e993c4ee6b1d4 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Thu, 30 Mar 2023 11:17:33 -0400 Subject: [PATCH 07/25] Still more memory and use newer gatk --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index f9da79feaa8..77110028107 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -134,12 +134,12 @@ workflow GvsCreateFilterSet { sites_only_vcf = MergeVCFs.output_vcf, sites_only_vcf_idx = MergeVCFs.output_vcf_index, output_prefix = filter_set_name, - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0", annotations = ["AS_QD", 
"AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", - extract_runtime_attributes = {"command_mem_gb": 15} + extract_runtime_attributes = {"command_mem_gb": 27} } call Utils.MergeVCFs as MergeScoredVCFs { From 068261584668bbb1196c5361aa3c321c76606723 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Thu, 30 Mar 2023 15:57:48 -0400 Subject: [PATCH 08/25] Up memory on all of VQSR Lite --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 77110028107..3d0849ca141 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -139,7 +139,9 @@ workflow GvsCreateFilterSet { resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false 
gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", - extract_runtime_attributes = {"command_mem_gb": 27} + extract_runtime_attributes = {"command_mem_gb": 27}, + train_runtime_attributes = {"command_mem_gb": 27}, + score_runtime_attributes = {"command_mem_gb": 15}, } call Utils.MergeVCFs as MergeScoredVCFs { From 111fbba159dab47fcd8fa92269a60cc4dc8af2bd Mon Sep 17 00:00:00 2001 From: gbggrant Date: Mon, 10 Apr 2023 13:59:44 -0400 Subject: [PATCH 09/25] Fix the usage of the Axiom resource. --- .../variantstore/wdl/GvsCreateFilterSet.wdl | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 3d0849ca141..9cb71a2c852 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -36,18 +36,42 @@ workflow GvsCreateFilterSet { Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ] # reference files + # Axiom - Used only for indels + # Classic: known=false,training=true,truth=false + # Lite: training=true,calibration=false File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi" 
+ + # DbSNP - BOTH SNPs and INDELs. But used only as known in classic (which isn't used in Lite and so dropped in lite) + # Classic: known=true,training=false,truth=false + # Lite: Unused File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf" File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx" + + # HapMap - SNPs + # Classic: known=false,training=true,truth=true + # Lite: training=true,calibration=true File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz" File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi" + + # Mills - Indels + # Classic: known=false,training=true,truth=true + # Lite: training=true,calibration=true File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" + + # Omni - SNPs + # Classic: known=false,training=true,truth=true + # Lite: training=true,calibration=true File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz" File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi" + + # 1000G - SNPs + # Classic: known=false,training=true,truth=false + # Lite: training=true,calibration=false File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi" + File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" File reference_dict = 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" File reference_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai" @@ -136,7 +160,7 @@ workflow GvsCreateFilterSet { output_prefix = filter_set_name, gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0", annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], - resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", + resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", extract_runtime_attributes = 
{"command_mem_gb": 27}, From 96f90267a9f7a4c63829057bff2c5e060dfc0a90 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 11:39:09 -0400 Subject: [PATCH 10/25] Clean up from a bad merge --- .../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl index d0b059d24ce..9ffd5c63b99 100644 --- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -116,6 +116,7 @@ workflow JointVcfFiltering { File extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx Array[File] model_files = TrainVariantAnnotationsModel.model_files + Array[File] scored_vcfs = ScoreVariantAnnotations.scored_vcf Array[File] scored_vcf_idxs = ScoreVariantAnnotations.scored_vcf_idx Array[File?] annotations_hdf5s = ScoreVariantAnnotations.annotations_hdf5 @@ -200,8 +201,9 @@ task TrainVariantAnnotationsModel { String gatk_docker File? gatk_override - } - Int disk_size = ceil(size(annots, "GB") + 100) + + RuntimeAttributes runtime_attributes = {} + } command { set -e @@ -254,6 +256,7 @@ task ScoreVariantAnnotations { String gatk_docker File? gatk_override + RuntimeAttributes runtime_attributes = {} } @@ -304,5 +307,4 @@ task ScoreVariantAnnotations { File? scores_hdf5 = "~{output_prefix}.score.scores.hdf5" # this file will only be produced if the number of sites scored is nonzero File? monitoring_log = "monitoring.log" } - } -} +} \ No newline at end of file From ad5189a5bb81bea94ec3ee8b03d1675624c250e7 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 15:45:35 -0400 Subject: [PATCH 11/25] Refactoring to move all VQSR Classic code to separate WDL. 
--- .../variantstore/wdl/GvsCreateFilterSet.wdl | 194 ++--------------- scripts/variantstore/wdl/GvsVQSRClassic.wdl | 203 ++++++++++++++++++ 2 files changed, 218 insertions(+), 179 deletions(-) create mode 100644 scripts/variantstore/wdl/GvsVQSRClassic.wdl diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index b5965a75b71..5fb5cb6ad1a 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -1,7 +1,7 @@ version 1.0 -import "GvsWarpTasks.wdl" as Tasks import "GvsUtils.wdl" as Utils +import "GvsVQSRClassic.wdl" as VQSRClassic import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite workflow GvsCreateFilterSet { @@ -12,66 +12,13 @@ workflow GvsCreateFilterSet { String call_set_identifier String filter_set_name - Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File? gatk_override Boolean use_classic_VQSR = true - Int? INDEL_VQSR_max_gaussians_override = 4 - Int? INDEL_VQSR_maximum_training_variants - Int? INDEL_VQSR_mem_gb_override - Int? SNP_VQSR_max_gaussians_override = 6 - Int? SNP_VQSR_mem_gb_override - Int? SNP_VQSR_sample_every_nth_variant - Int? SNP_VQSR_maximum_training_variants - # This is the minimum number of samples where the SNP model will be created and applied in separate tasks - # (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered) - # For WARP classic this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least - # with the default VM memory settings) so this was adjusted down to 5K. 
- Int snps_variant_recalibration_threshold = 5000 } - Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ] - - # reference files - # Axiom - Used only for indels - # Classic: known=false,training=true,truth=false - # Lite: training=true,calibration=false - File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" - File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi" - - # DbSNP - BOTH SNPs and INDELs. But used only as known in classic (which isn't used in Lite and so dropped in lite) - # Classic: known=true,training=false,truth=false - # Lite: Unused - File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf" - File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx" - - # HapMap - SNPs - # Classic: known=false,training=true,truth=true - # Lite: training=true,calibration=true - File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz" - File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi" - - # Mills - Indels - # Classic: known=false,training=true,truth=true - # Lite: training=true,calibration=true - File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" - File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" - - # Omni - SNPs - # Classic: known=false,training=true,truth=true - # Lite: training=true,calibration=true - File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz" - File omni_resource_vcf_index = 
"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi" - - # 1000G - SNPs - # Classic: known=false,training=true,truth=false - # Lite: training=true,calibration=false - File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" - File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi" - File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" File reference_dict = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" File reference_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai" @@ -226,129 +173,24 @@ workflow GvsCreateFilterSet { } if (use_classic_VQSR) { - - call Tasks.IndelsVariantRecalibrator { + call VQSRClassic.JointVcfFiltering as VQSRClassic { input: + base_name = filter_set_name, + num_samples_loaded = GetNumSamplesLoaded.num_samples, sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".indels.recal", - tranches_filename = filter_set_name + ".indels.tranches", - recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = INDEL_VQSR_mem_gb_override, - max_gaussians = INDEL_VQSR_max_gaussians_override, - 
maximum_training_variants = INDEL_VQSR_maximum_training_variants, - } - - if (GetNumSamplesLoaded.num_samples > snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".snps.recal", - tranches_filename = filter_set_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report_filename = filter_set_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override, - max_gaussians = SNP_VQSR_max_gaussians_override, - sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant, - maximum_training_variants = SNP_VQSR_maximum_training_variants - } - - scatter (idx in range(length(ExtractFilterTask.output_vcf))) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = ExtractFilterTask.output_vcf[idx], - sites_only_variant_filtered_vcf_index = ExtractFilterTask.output_vcf_index[idx], - recalibration_filename = filter_set_name + ".snps." + idx + ".recal", - tranches_filename = filter_set_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override - } - } - - call Tasks.GatherTranches as SNPGatherTranches { - input: - tranches = SNPsVariantRecalibratorScattered.tranches, - output_filename = filter_set_name + ".snps.gathered.tranches", - output_tranche_values = snp_recalibration_tranche_values, - mode = "SNP", - disk_size = "200", - gatk_override = gatk_override - } - - call Utils.MergeVCFs as MergeRecalibrationFiles { - input: - input_vcfs = SNPsVariantRecalibratorScattered.recalibration, - gather_type = "CONVENTIONAL", - output_vcf_name = "${filter_set_name}.vrecalibration.gz", - preemptible_tries = 3, - } - } - - if (GetNumSamplesLoaded.num_samples <= snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".snps.recal", - tranches_filename = filter_set_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - 
omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override, - max_gaussians = SNP_VQSR_max_gaussians_override, - } + sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index, + sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf, + sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index, } call PopulateFilterSetInfo as PopulateFilterSetInfoClassic { input: gatk_override = gatk_override, filter_set_name = filter_set_name, - snp_recal_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]), - snp_recal_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]), - indel_recal_file = IndelsVariantRecalibrator.recalibration, - indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index, + snp_recal_file = VQSRClassic.snps_variant_recalibration_file, + snp_recal_file_index = VQSRClassic.snps_variant_recalibration_file_index, + indel_recal_file = VQSRClassic.indels_variant_recalibration_file, + indel_recal_file_index = VQSRClassic.indels_variant_recalibration_file_index, fq_info_destination_table = fq_info_destination_table, filter_schema = fq_info_destination_table_schema, project_id = project_id, @@ -369,8 +211,8 @@ workflow GvsCreateFilterSet { input: gatk_override = gatk_override, filter_set_name = filter_set_name, - snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]), - indel_recal_tranches = IndelsVariantRecalibrator.tranches, + snp_recal_tranches = 
VQSRClassic.snps_variant_tranches_file, + indel_recal_tranches = VQSRClassic.indels_variant_tranches_file, fq_tranches_destination_table = fq_tranches_destination_table, project_id = project_id } @@ -387,19 +229,13 @@ workflow GvsCreateFilterSet { [AltAlleleTableDatetimeCheck.monitoring_log], ExtractFilterTask.monitoring_log, [MergeVCFs.monitoring_log], - select_first([JointVcfFiltering.monitoring_logs, []]), # VQSR Lite Logging starts here + select_first([JointVcfFiltering.monitoring_logs, []]), [MergeScoredVCFs.monitoring_log], [CreateFilteredScoredSNPsVCF.monitoring_log], [CreateFilteredScoredINDELsVCF.monitoring_log], [PopulateFilterSetInfo.monitoring_log], [PopulateFilterSetSites.monitoring_log], - [IndelsVariantRecalibrator.monitoring_log], # VQSR Classic Logging Starts here - [SNPsVariantRecalibratorCreateModel.monitoring_log], - select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]), - [SNPGatherTranches.monitoring_log], - [MergeRecalibrationFiles.monitoring_log], - [IndelsVariantRecalibrator.monitoring_log], - [SNPsVariantRecalibratorClassic.monitoring_log], + select_first([VQSRClassic.monitoring_logs, []]), [PopulateFilterSetInfoClassic.monitoring_log], [PopulateFilterSetSitesClassic.monitoring_log], [PopulateFilterSetTranches.monitoring_log] diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl new file mode 100644 index 00000000000..de93b2e0571 --- /dev/null +++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl @@ -0,0 +1,203 @@ +version 1.0 + +import "GvsWarpTasks.wdl" as Tasks +import "GvsUtils.wdl" as Utils + +workflow JointVcfFiltering { + input { + String base_name + Int num_samples_loaded + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_idx + Array[File] sites_only_variant_filtered_vcfs + Array[File] sites_only_variant_filtered_vcf_idxs + + Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + 
Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + + File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" + File? gatk_override + + Int? INDEL_VQSR_max_gaussians_override = 4 + Int? INDEL_VQSR_maximum_training_variants + Int? INDEL_VQSR_mem_gb_override + Int? SNP_VQSR_max_gaussians_override = 6 + Int? SNP_VQSR_mem_gb_override + Int? SNP_VQSR_sample_every_nth_variant + Int? SNP_VQSR_maximum_training_variants + + # This is the minimum number of samples where the SNP model will be created and applied in separate tasks + # (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered) + # For VQSR classic this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least + # with the default VM memory settings) so this was adjusted down to 5K. + Int snps_variant_recalibration_threshold = 5000 + } + + Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ] + + # reference files + # Axiom - Used only for indels + # Classic: known=false,training=true,truth=false + File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" + File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi" + + # DbSNP - BOTH SNPs and INDELs. 
But used only as known in classic (which isn't used in Lite and so dropped in lite) + # Classic: known=true,training=false,truth=false + File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf" + File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx" + + # HapMap - SNPs + # Classic: known=false,training=true,truth=true + File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz" + File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi" + + # Mills - Indels + # Classic: known=false,training=true,truth=true + File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" + File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" + + # Omni - SNPs + # Classic: known=false,training=true,truth=true + File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz" + File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi" + + # 1000G - SNPs + # Classic: known=false,training=true,truth=false + File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi" + + call Tasks.IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf, + sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx, + recalibration_filename = base_name + ".indels.recal", + tranches_filename = base_name + ".indels.tranches", + recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", 
"97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = INDEL_VQSR_mem_gb_override, + max_gaussians = INDEL_VQSR_max_gaussians_override, + maximum_training_variants = INDEL_VQSR_maximum_training_variants, + } + + if (num_samples_loaded > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf, + sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx, + recalibration_filename = base_name + ".snps.recal", + tranches_filename = base_name + ".snps.tranches", + model_report_filename = base_name + ".snps.model.report", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override, + max_gaussians = SNP_VQSR_max_gaussians_override, + sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant, + maximum_training_variants = SNP_VQSR_maximum_training_variants + } + + scatter 
(idx in range(length(sites_only_variant_filtered_vcfs))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcfs[idx], + sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idxs[idx], + recalibration_filename = base_name + ".snps." + idx + ".recal", + tranches_filename = base_name + ".snps." + idx + ".tranches", + model_report = SNPsVariantRecalibratorCreateModel.model_report, + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = base_name + ".snps.gathered.tranches", + output_tranche_values = snp_recalibration_tranche_values, + mode = "SNP", + disk_size = "200", + gatk_override = gatk_override + } + + call Utils.MergeVCFs as MergeRecalibrationFiles { + input: + input_vcfs = SNPsVariantRecalibratorScattered.recalibration, + gather_type = "CONVENTIONAL", + output_vcf_name = "${base_name}.vrecalibration.vcf.gz", + preemptible_tries = 3, + } + } + + if (num_samples_loaded <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = sites_only_variant_filtered_vcf, + 
sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx, + recalibration_filename = base_name + ".snps.recal", + tranches_filename = base_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override, + max_gaussians = SNP_VQSR_max_gaussians_override, + } + } + + output { + File snps_variant_recalibration_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]) + File snps_variant_recalibration_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]) + File snps_variant_tranches_file = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]) + File indels_variant_recalibration_file = IndelsVariantRecalibrator.recalibration + File indels_variant_recalibration_file_index = IndelsVariantRecalibrator.recalibration_index + File indels_variant_tranches_file = IndelsVariantRecalibrator.tranches + Array[File?] 
monitoring_logs = flatten( + [ + [IndelsVariantRecalibrator.monitoring_log], + [SNPsVariantRecalibratorCreateModel.monitoring_log], + select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]), + [SNPGatherTranches.monitoring_log], + [MergeRecalibrationFiles.monitoring_log], + [IndelsVariantRecalibrator.monitoring_log], + [SNPsVariantRecalibratorClassic.monitoring_log] + ]) + } + +} + From fb1a90bd93bad1fc64d63a63938007913ff351d4 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 16:13:22 -0400 Subject: [PATCH 12/25] Allow to push some VQSR Classic parameters down from GvsUnified --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 5fb5cb6ad1a..6de47e1a1a9 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -17,6 +17,14 @@ workflow GvsCreateFilterSet { File? gatk_override Boolean use_classic_VQSR = true + + Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + + Int? INDEL_VQSR_max_gaussians_override = 4 + Int? INDEL_VQSR_mem_gb_override + Int? SNP_VQSR_max_gaussians_override = 6 + Int? 
SNP_VQSR_mem_gb_override } File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" @@ -181,6 +189,12 @@ workflow GvsCreateFilterSet { sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index, sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf, sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index, + snp_recalibration_annotation_values = snp_recalibration_annotation_values, + indel_recalibration_annotation_values = indel_recalibration_annotation_values, + INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override, + INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override, + SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override, + SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override } call PopulateFilterSetInfo as PopulateFilterSetInfoClassic { From 12a8d7d3d7aac199dd1117fa6efea9e5055a2ec5 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 19:36:09 -0400 Subject: [PATCH 13/25] Comment out for debugging --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 6de47e1a1a9..529493ef098 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -249,7 +249,7 @@ workflow GvsCreateFilterSet { [CreateFilteredScoredINDELsVCF.monitoring_log], [PopulateFilterSetInfo.monitoring_log], [PopulateFilterSetSites.monitoring_log], - select_first([VQSRClassic.monitoring_logs, []]), +# select_first([VQSRClassic.monitoring_logs, []]), [PopulateFilterSetInfoClassic.monitoring_log], [PopulateFilterSetSitesClassic.monitoring_log], [PopulateFilterSetTranches.monitoring_log] From b95340cb2eec927ba1bd5a23c8da5c2b4fc4fd76 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 19:38:00 -0400 Subject: [PATCH 14/25] Actually, try it like this. 
--- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 529493ef098..e59de5b834e 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -249,7 +249,7 @@ workflow GvsCreateFilterSet { [CreateFilteredScoredINDELsVCF.monitoring_log], [PopulateFilterSetInfo.monitoring_log], [PopulateFilterSetSites.monitoring_log], -# select_first([VQSRClassic.monitoring_logs, []]), + VQSRClassic.monitoring_logs, [PopulateFilterSetInfoClassic.monitoring_log], [PopulateFilterSetSitesClassic.monitoring_log], [PopulateFilterSetTranches.monitoring_log] From bd542c39706c6d96433f38a152cb841d741f4860 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 19:40:50 -0400 Subject: [PATCH 15/25] No, actually not --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index e59de5b834e..529493ef098 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -249,7 +249,7 @@ workflow GvsCreateFilterSet { [CreateFilteredScoredINDELsVCF.monitoring_log], [PopulateFilterSetInfo.monitoring_log], [PopulateFilterSetSites.monitoring_log], - VQSRClassic.monitoring_logs, +# select_first([VQSRClassic.monitoring_logs, []]), [PopulateFilterSetInfoClassic.monitoring_log], [PopulateFilterSetSitesClassic.monitoring_log], [PopulateFilterSetTranches.monitoring_log] From fa0f0ee543df5705ca6a17ce07ca01c53cf92ea6 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 19:58:06 -0400 Subject: [PATCH 16/25] More debugging, one fix. 
--- .../variantstore/wdl/GvsCreateFilterSet.wdl | 24 +++++++++---------- scripts/variantstore/wdl/GvsVQSRClassic.wdl | 1 - 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 529493ef098..6bdbd29cbd4 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -238,20 +238,20 @@ workflow GvsCreateFilterSet { flatten( [ [SamplesTableDatetimeCheck.monitoring_log], - [GetNumSamplesLoaded.monitoring_log], - [SplitIntervals.monitoring_log], - [AltAlleleTableDatetimeCheck.monitoring_log], +# [GetNumSamplesLoaded.monitoring_log], +# [SplitIntervals.monitoring_log], +# [AltAlleleTableDatetimeCheck.monitoring_log], ExtractFilterTask.monitoring_log, - [MergeVCFs.monitoring_log], +# [MergeVCFs.monitoring_log], select_first([JointVcfFiltering.monitoring_logs, []]), - [MergeScoredVCFs.monitoring_log], - [CreateFilteredScoredSNPsVCF.monitoring_log], - [CreateFilteredScoredINDELsVCF.monitoring_log], - [PopulateFilterSetInfo.monitoring_log], - [PopulateFilterSetSites.monitoring_log], -# select_first([VQSRClassic.monitoring_logs, []]), - [PopulateFilterSetInfoClassic.monitoring_log], - [PopulateFilterSetSitesClassic.monitoring_log], +# [MergeScoredVCFs.monitoring_log], +# [CreateFilteredScoredSNPsVCF.monitoring_log], +# [CreateFilteredScoredINDELsVCF.monitoring_log], +# [PopulateFilterSetInfo.monitoring_log], +# [PopulateFilterSetSites.monitoring_log], + select_first([VQSRClassic.monitoring_logs, []]), +# [PopulateFilterSetInfoClassic.monitoring_log], +# [PopulateFilterSetSitesClassic.monitoring_log], [PopulateFilterSetTranches.monitoring_log] ] ) diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl index de93b2e0571..efdcf045a6b 100644 --- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl +++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl @@ -194,7 +194,6 @@ workflow 
JointVcfFiltering { select_first([SNPsVariantRecalibratorScattered.monitoring_log, []]), [SNPGatherTranches.monitoring_log], [MergeRecalibrationFiles.monitoring_log], - [IndelsVariantRecalibrator.monitoring_log], [SNPsVariantRecalibratorClassic.monitoring_log] ]) } From dde74e7ca86cde196ba39300dd1ebbb74d076c91 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Tue, 18 Apr 2023 22:00:51 -0400 Subject: [PATCH 17/25] Add a select_all --- scripts/variantstore/wdl/GvsVQSRClassic.wdl | 5 +++-- scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl index efdcf045a6b..158bbec8faa 100644 --- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl +++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl @@ -187,7 +187,8 @@ workflow JointVcfFiltering { File indels_variant_recalibration_file = IndelsVariantRecalibrator.recalibration File indels_variant_recalibration_file_index = IndelsVariantRecalibrator.recalibration_index File indels_variant_tranches_file = IndelsVariantRecalibrator.tranches - Array[File?] monitoring_logs = flatten( + Array[File] monitoring_logs = select_all( + flatten( [ [IndelsVariantRecalibrator.monitoring_log], [SNPsVariantRecalibratorCreateModel.monitoring_log], @@ -195,7 +196,7 @@ workflow JointVcfFiltering { [SNPGatherTranches.monitoring_log], [MergeRecalibrationFiles.monitoring_log], [SNPsVariantRecalibratorClassic.monitoring_log] - ]) + ])) } } diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl index 9ffd5c63b99..55b5fa1c390 100644 --- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -184,7 +184,7 @@ task ExtractVariantAnnotations { File? 
unlabeled_annotations_hdf5 = "~{output_prefix}.extract.unlabeled.annot.hdf5" File extracted_vcf = "~{output_prefix}.extract.vcf.gz" # this line will break if extra_args includes the do-not-gzip-vcf-output argument File extracted_vcf_idx = "~{output_prefix}.extract.vcf.gz.tbi" # this line will break if extra_args includes the do-not-gzip-vcf-output argument - File monitoring_log = "monitoring.log" + File? monitoring_log = "monitoring.log" } } From 8137b7bd158efb79419b5d198a9a04f6612d9639 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 19 Apr 2023 08:00:38 -0400 Subject: [PATCH 18/25] Okay - put all the logs back for final test of classic. --- .../variantstore/wdl/GvsCreateFilterSet.wdl | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 6bdbd29cbd4..6de47e1a1a9 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -238,20 +238,20 @@ workflow GvsCreateFilterSet { flatten( [ [SamplesTableDatetimeCheck.monitoring_log], -# [GetNumSamplesLoaded.monitoring_log], -# [SplitIntervals.monitoring_log], -# [AltAlleleTableDatetimeCheck.monitoring_log], + [GetNumSamplesLoaded.monitoring_log], + [SplitIntervals.monitoring_log], + [AltAlleleTableDatetimeCheck.monitoring_log], ExtractFilterTask.monitoring_log, -# [MergeVCFs.monitoring_log], + [MergeVCFs.monitoring_log], select_first([JointVcfFiltering.monitoring_logs, []]), -# [MergeScoredVCFs.monitoring_log], -# [CreateFilteredScoredSNPsVCF.monitoring_log], -# [CreateFilteredScoredINDELsVCF.monitoring_log], -# [PopulateFilterSetInfo.monitoring_log], -# [PopulateFilterSetSites.monitoring_log], + [MergeScoredVCFs.monitoring_log], + [CreateFilteredScoredSNPsVCF.monitoring_log], + [CreateFilteredScoredINDELsVCF.monitoring_log], + [PopulateFilterSetInfo.monitoring_log], + [PopulateFilterSetSites.monitoring_log], 
select_first([VQSRClassic.monitoring_logs, []]), -# [PopulateFilterSetInfoClassic.monitoring_log], -# [PopulateFilterSetSitesClassic.monitoring_log], + [PopulateFilterSetInfoClassic.monitoring_log], + [PopulateFilterSetSitesClassic.monitoring_log], [PopulateFilterSetTranches.monitoring_log] ] ) From 8e45dd6a4a6c02a67626c71dcd69c97ccd893c8a Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 19 Apr 2023 16:09:49 -0400 Subject: [PATCH 19/25] Update .dockstore.yml for testing VQSR Classic Refactoring. --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index aea4b176091..5896f106add 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -224,6 +224,7 @@ workflows: - master - ah_var_store - vs_866_update_variants_base_image + - gg_VS-776_UpdateToLatestVQSRLite - name: GvsQuickstartHailIntegration subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl From 99f956d08c2e954761a1815f8a7eeff5394e2c4f Mon Sep 17 00:00:00 2001 From: gbggrant Date: Wed, 19 Apr 2023 16:14:55 -0400 Subject: [PATCH 20/25] Fix the .dockstore.yml --- .dockstore.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockstore.yml b/.dockstore.yml index 5896f106add..9586879f7ad 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -224,7 +224,6 @@ workflows: - master - ah_var_store - vs_866_update_variants_base_image - - gg_VS-776_UpdateToLatestVQSRLite - name: GvsQuickstartHailIntegration subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl @@ -241,6 +240,7 @@ workflows: - master - ah_var_store - vs_888_fix_broken_gsutil_docker + - gg_VS-776_UpdateToLatestVQSRLite - name: GvsIngestTieout subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl From 3f7234e2515ae5d7bed2910e4da903ed077354a3 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Thu, 20 Apr 2023 16:20:44 -0400 Subject: [PATCH 21/25] Renamed some VQSR Classic Specific inputs. 
Added test files. --- .../variantstore/wdl/GvsCreateFilterSet.wdl | 32 +++++++++++-------- scripts/variantstore/wdl/GvsUnified.wdl | 29 ++++++++++------- scripts/variantstore/wdl/GvsVQSRClassic.wdl | 15 +++++---- .../test_10_samples.22.avg.vcf.gz | 3 -- .../test_10_samples.22.avg.vcf.gz.tbi | 3 -- .../test_10_samples.23.avg.vcf.gz | 3 -- .../test_10_samples.23.avg.vcf.gz.tbi | 3 -- .../test_10_samples.chr21.avg.vcf.gz | 3 ++ .../test_10_samples.chr21.avg.vcf.gz.tbi | 3 ++ ...t_10_samples.chr21_chr22.sites_only.vcf.gz | 3 ++ ..._samples.chr21_chr22.sites_only.vcf.gz.tbi | 3 ++ .../test_10_samples.chr22.avg.vcf.gz | 3 ++ .../test_10_samples.chr22.avg.vcf.gz.tbi | 3 ++ .../test_10_samples.empty.avg.vcf.gz | 3 ++ .../test_10_samples.empty.avg.vcf.gz.tbi | 3 ++ .../test_10_samples.sites_only.vcf.gz | 3 -- .../test_10_samples.sites_only.vcf.gz.tbi | 3 -- 17 files changed, 67 insertions(+), 51 deletions(-) delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi create mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz create 
mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz delete mode 100644 src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 6de47e1a1a9..4775a5501c9 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -17,14 +17,17 @@ workflow GvsCreateFilterSet { File? gatk_override Boolean use_classic_VQSR = true + # These are the SNP and INDEL annotations used for VQSR Classic, the order matters for consistency between runs. + Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + Array[String] vqsr_classic_snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] - Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 + Int? INDEL_VQSR_CLASSIC_mem_gb_override + Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6 + Int? SNP_VQSR_CLASSIC_mem_gb_override - Int? INDEL_VQSR_max_gaussians_override = 4 - Int? INDEL_VQSR_mem_gb_override - Int? SNP_VQSR_max_gaussians_override = 6 - Int? SNP_VQSR_mem_gb_override + # These are the (unified) annotations used for VQSR Lite. The order matters for consistency between runs. 
+ Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] } File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" @@ -113,14 +116,15 @@ workflow GvsCreateFilterSet { sites_only_vcf = MergeVCFs.output_vcf, sites_only_vcf_idx = MergeVCFs.output_vcf_index, output_prefix = filter_set_name, - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0", - annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], + annotations = vqsr_lite_recalibration_annotations, resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", extract_runtime_attributes = {"command_mem_gb": 27}, train_runtime_attributes = {"command_mem_gb": 27}, score_runtime_attributes = {"command_mem_gb": 15}, + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0", + gatk_override = gatk_override, monitoring_script = "gs://gvs-internal/cromwell_monitoring_script.sh" } @@ -189,12 +193,12 @@ workflow GvsCreateFilterSet { sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index, sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf, sites_only_variant_filtered_vcf_idxs = 
ExtractFilterTask.output_vcf_index, - snp_recalibration_annotation_values = snp_recalibration_annotation_values, - indel_recalibration_annotation_values = indel_recalibration_annotation_values, - INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override, - INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override, - SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override, - SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override + snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations, + indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations, + INDEL_VQSR_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, + INDEL_VQSR_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, + SNP_VQSR_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, + SNP_VQSR_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override } call PopulateFilterSetInfo as PopulateFilterSetInfoClassic { diff --git a/scripts/variantstore/wdl/GvsUnified.wdl b/scripts/variantstore/wdl/GvsUnified.wdl index 80bfdd1afef..880cc302201 100644 --- a/scripts/variantstore/wdl/GvsUnified.wdl +++ b/scripts/variantstore/wdl/GvsUnified.wdl @@ -39,13 +39,18 @@ workflow GvsUnified { # Begin GvsCreateFilterSet String filter_set_name = call_set_identifier - Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] - Int? INDEL_VQSR_max_gaussians_override = 4 - Int? INDEL_VQSR_mem_gb_override - Int? SNP_VQSR_max_gaussians_override = 6 - Int? SNP_VQSR_mem_gb_override + # These are the SNP and INDEL annotations used for VQSR Classic, the order matters. 
+ Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + Array[String] vqsr_classic_snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + + # These are the (unified) annotations used for VQSR Lite. The order matters. + Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + + Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 + Int? INDEL_VQSR_CLASSIC_mem_gb_override + Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6 + Int? SNP_VQSR_CLASSIC_mem_gb_override # End GvsCreateFilterSet # Begin GvsPrepareRangesCallset @@ -116,14 +121,14 @@ workflow GvsUnified { project_id = project_id, call_set_identifier = call_set_identifier, filter_set_name = filter_set_name, - indel_recalibration_annotation_values = indel_recalibration_annotation_values, - snp_recalibration_annotation_values = snp_recalibration_annotation_values, + vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations, + vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations, interval_list = interval_list, gatk_override = gatk_override, - INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override, - INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override, - SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override, - SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override + INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, + INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, + SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, + SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override } call PrepareRangesCallset.GvsPrepareCallset { diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl 
index 158bbec8faa..181b1eaf8ce 100644 --- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl +++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl @@ -12,8 +12,8 @@ workflow JointVcfFiltering { Array[File] sites_only_variant_filtered_vcfs Array[File] sites_only_variant_filtered_vcf_idxs - Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File? gatk_override @@ -34,6 +34,7 @@ workflow JointVcfFiltering { } Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ] + Array[String] indel_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"] # reference files # Axiom - Used only for indels @@ -72,8 +73,8 @@ workflow JointVcfFiltering { sites_only_variant_filtered_vcf_index = sites_only_variant_filtered_vcf_idx, recalibration_filename = base_name + ".indels.recal", tranches_filename = base_name + ".indels.tranches", - recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - recalibration_annotation_values = indel_recalibration_annotation_values, + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotations, mills_resource_vcf = mills_resource_vcf, mills_resource_vcf_index = 
mills_resource_vcf_index, axiomPoly_resource_vcf = axiomPoly_resource_vcf, @@ -96,7 +97,7 @@ workflow JointVcfFiltering { tranches_filename = base_name + ".snps.tranches", model_report_filename = base_name + ".snps.model.report", recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, + recalibration_annotation_values = snp_recalibration_annotations, hapmap_resource_vcf = hapmap_resource_vcf, hapmap_resource_vcf_index = hapmap_resource_vcf_index, omni_resource_vcf = omni_resource_vcf, @@ -122,7 +123,7 @@ workflow JointVcfFiltering { tranches_filename = base_name + ".snps." + idx + ".tranches", model_report = SNPsVariantRecalibratorCreateModel.model_report, recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, + recalibration_annotation_values = snp_recalibration_annotations, hapmap_resource_vcf = hapmap_resource_vcf, hapmap_resource_vcf_index = hapmap_resource_vcf_index, omni_resource_vcf = omni_resource_vcf, @@ -164,7 +165,7 @@ workflow JointVcfFiltering { recalibration_filename = base_name + ".snps.recal", tranches_filename = base_name + ".snps.tranches", recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, + recalibration_annotation_values = snp_recalibration_annotations, hapmap_resource_vcf = hapmap_resource_vcf, hapmap_resource_vcf_index = hapmap_resource_vcf_index, omni_resource_vcf = omni_resource_vcf, diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz deleted file mode 100644 index 31cba1e00f8..00000000000 --- a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:dcf1dbda2255fbe1372d09d364835452d610822070b6b9b56b1733388aa3cd19 -size 140900871 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi deleted file mode 100644 index 5fd47681849..00000000000 --- a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af32939cd4f63a0a9251a50cc5658738285d4cee4833bcf1cda6b92d90c4b99b -size 100153 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz deleted file mode 100644 index 55dde2493e4..00000000000 --- a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4144805bd8fabc74f3eea39a910dbd5c24017b844c44640efda49e3b0febe693 -size 112076612 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi deleted file mode 100644 index 114d43936c5..00000000000 --- a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3484a38abb76952b02863099c383eae26d50f44514c5045992f63cc3294ebe8 -size 114295 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz new file mode 100644 index 00000000000..304adc48127 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8556461fa8933187bea3708d72e55927a67ff9d73938f5cc26bf33c54cd58e2a +size 34193100 diff --git 
a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi new file mode 100644 index 00000000000..80332910b23 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:320f43cc7cbee56a7cc06fbd4686041d970b496d205a3e6a7665b2a82f0214ed +size 30439 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz new file mode 100644 index 00000000000..b74468a5c83 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bef1de9d95dd0d336e4285f6e97b5db274ff5bd980a229ba4e2b64f9b2e3e50 +size 46372366 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi new file mode 100644 index 00000000000..2abbb9be100 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e67ffa564076f4dc501c6d0d92825aaffb2c0327bdfc1db31d02c392b6c664 +size 57950 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz new file mode 100644 index 00000000000..4d52121291a --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa01d2fdb8558701b87e661cd6b191392db29ee2fc29a5eeb7eb565fba1448c8 +size 34534658 diff --git 
a/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi new file mode 100644 index 00000000000..87c3471681f --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f110fb2bb242433375b44ecd2c756846574881e857ae803b89b0f7138949a3bd +size 30502 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz new file mode 100644 index 00000000000..026244d096e --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea51baa7a90a550ff4ae363728a21de88ef78aa5b6ebb6b7807aa9bd93e04459 +size 27578 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi new file mode 100644 index 00000000000..0fc3d144274 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c9e74e847b0ca3ca72ab2bcc803ac43efaff23fa701271af7d9208df054c08e +size 72 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz deleted file mode 100644 index f75a07bd09c..00000000000 --- a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00212a6387eba259a2d060eef08f50f3de512a155ed4e746d38530310a582e14 -size 134260565 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi 
b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi deleted file mode 100644 index 475b5ba83a0..00000000000 --- a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c99412c88d072d494e545f56acdf621f6c960cbb8f2d734532cf9d5d11e83104 -size 133485 From 662605b41b2f2ac4463472300a855360dfd7407f Mon Sep 17 00:00:00 2001 From: gbggrant Date: Thu, 20 Apr 2023 16:37:16 -0400 Subject: [PATCH 22/25] Missed a dependency --- .../wdl/GvsJointVariantCalling.wdl | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index 32c8688cf1c..a25c5de4989 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -30,16 +30,19 @@ workflow GvsJointVariantCalling { File sample_names_to_extract = "" Int split_intervals_disk_size_override = "" Int split_intervals_mem_override = "" - Int INDEL_VQSR_max_gaussians_override = 4 - Int INDEL_VQSR_mem_gb_override = "" - Int SNP_VQSR_max_gaussians_override = 6 - Int SNP_VQSR_mem_gb_override = "" + Int INDEL_VQSR_CLASSIC_max_gaussians_override = 4 + Int INDEL_VQSR_CLASSIC_mem_gb_override = "" + Int SNP_VQSR_CLASSIC_max_gaussians_override = 6 + Int SNP_VQSR_CLASSIC_mem_gb_override = "" } # This is the most updated snapshot of the code as of Feb 10, 2023 File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-654-g4a1c203-SNAPSHOT-local.jar" File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" - Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", 
"AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + + # These are the SNP and INDEL annotations used for VQSR Classic, the order matters. + Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + Array[String] vqsr_classic_snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed" # do we ever want non-beta customers to use this instead of using GvsUnified directly? If so, we can make this an # argument that just defaults to true @@ -65,7 +68,7 @@ workflow GvsJointVariantCalling { extract_table_prefix = extract_table_prefix, fq_temp_table_dataset = "~{project_id}.~{dataset_name}", gatk_override = gatk_override, - indel_recalibration_annotation_values = indel_recalibration_annotation_values, + vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations, interval_list = interval_list, interval_weights_bed = interval_weights_bed, load_data_batch_size = load_data_batch_size, @@ -74,13 +77,13 @@ workflow GvsJointVariantCalling { query_labels = query_labels, query_project = project_id, sample_names_to_extract = sample_names_to_extract, - snp_recalibration_annotation_values = snp_recalibration_annotation_values, + vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations, split_intervals_disk_size_override = split_intervals_disk_size_override, split_intervals_mem_override = split_intervals_mem_override, - INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override, - INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override, - SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override, - SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override, + INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, + INDEL_VQSR_CLASSIC_mem_gb_override = 
INDEL_VQSR_CLASSIC_mem_gb_override, + SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, + SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override, drop_state = drop_state, is_beta_user = is_beta_user, } From bf9eea8b031452d4969ae39e01f4b29a605c57c1 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Fri, 21 Apr 2023 09:57:26 -0400 Subject: [PATCH 23/25] Addressing code review comments. --- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 12 +++--------- .../variantstore/wdl/GvsJointVariantCalling.wdl | 6 ------ scripts/variantstore/wdl/GvsUnified.wdl | 9 --------- scripts/variantstore/wdl/GvsVQSRClassic.wdl | 14 ++++---------- 4 files changed, 7 insertions(+), 34 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 4775a5501c9..a50a97494fc 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -17,17 +17,13 @@ workflow GvsCreateFilterSet { File? gatk_override Boolean use_classic_VQSR = true - # These are the SNP and INDEL annotations used for VQSR Classic, the order matters for consistency between runs. - Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] vqsr_classic_snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 Int? INDEL_VQSR_CLASSIC_mem_gb_override Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6 Int? SNP_VQSR_CLASSIC_mem_gb_override - # These are the (unified) annotations used for VQSR Lite. The order matters for consistency between runs. - Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + RuntimeAttributes? 
vqsr_lite_extract_runtime_attributes = {"command_mem_gb": 27} } File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" @@ -116,11 +112,11 @@ workflow GvsCreateFilterSet { sites_only_vcf = MergeVCFs.output_vcf, sites_only_vcf_idx = MergeVCFs.output_vcf_index, output_prefix = filter_set_name, - annotations = vqsr_lite_recalibration_annotations, + annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"], resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", - extract_runtime_attributes = {"command_mem_gb": 27}, + extract_runtime_attributes = vqsr_lite_extract_runtime_attributes, train_runtime_attributes = {"command_mem_gb": 27}, score_runtime_attributes = {"command_mem_gb": 15}, gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0", @@ -193,8 +189,6 @@ workflow GvsCreateFilterSet { sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index, sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf, sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index, - snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations, - indel_recalibration_annotations = 
vqsr_classic_indel_recalibration_annotations, INDEL_VQSR_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, INDEL_VQSR_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, SNP_VQSR_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index a25c5de4989..eef57f7e1e6 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -39,10 +39,6 @@ workflow GvsJointVariantCalling { File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-654-g4a1c203-SNAPSHOT-local.jar" File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" - # These are the SNP and INDEL annotations used for VQSR Classic, the order matters. - Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] vqsr_classic_snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] - File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed" # do we ever want non-beta customers to use this instead of using GvsUnified directly? 
If so, we can make this an # argument that just defaults to true @@ -68,7 +64,6 @@ workflow GvsJointVariantCalling { extract_table_prefix = extract_table_prefix, fq_temp_table_dataset = "~{project_id}.~{dataset_name}", gatk_override = gatk_override, - vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations, interval_list = interval_list, interval_weights_bed = interval_weights_bed, load_data_batch_size = load_data_batch_size, @@ -77,7 +72,6 @@ workflow GvsJointVariantCalling { query_labels = query_labels, query_project = project_id, sample_names_to_extract = sample_names_to_extract, - vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations, split_intervals_disk_size_override = split_intervals_disk_size_override, split_intervals_mem_override = split_intervals_mem_override, INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, diff --git a/scripts/variantstore/wdl/GvsUnified.wdl b/scripts/variantstore/wdl/GvsUnified.wdl index 880cc302201..88d561b3cf3 100644 --- a/scripts/variantstore/wdl/GvsUnified.wdl +++ b/scripts/variantstore/wdl/GvsUnified.wdl @@ -40,13 +40,6 @@ workflow GvsUnified { # Begin GvsCreateFilterSet String filter_set_name = call_set_identifier - # These are the SNP and INDEL annotations used for VQSR Classic, the order matters. - Array[String] vqsr_classic_indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] vqsr_classic_snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] - - # These are the (unified) annotations used for VQSR Lite. The order matters. - Array[String] vqsr_lite_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] - Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 Int? INDEL_VQSR_CLASSIC_mem_gb_override Int? 
SNP_VQSR_CLASSIC_max_gaussians_override = 6 @@ -121,8 +114,6 @@ workflow GvsUnified { project_id = project_id, call_set_identifier = call_set_identifier, filter_set_name = filter_set_name, - vqsr_classic_indel_recalibration_annotations = vqsr_classic_indel_recalibration_annotations, - vqsr_classic_snp_recalibration_annotations = vqsr_classic_snp_recalibration_annotations, interval_list = interval_list, gatk_override = gatk_override, INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSRClassic.wdl index 181b1eaf8ce..4683d422823 100644 --- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl +++ b/scripts/variantstore/wdl/GvsVQSRClassic.wdl @@ -12,9 +12,6 @@ workflow JointVcfFiltering { Array[File] sites_only_variant_filtered_vcfs Array[File] sites_only_variant_filtered_vcf_idxs - Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] - Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] - File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File? 
gatk_override @@ -33,37 +30,34 @@ workflow JointVcfFiltering { Int snps_variant_recalibration_threshold = 5000 } + Array[String] indel_recalibration_annotations = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"] + Array[String] snp_recalibration_annotations = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"] + Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ] Array[String] indel_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"] # reference files # Axiom - Used only for indels - # Classic: known=false,training=true,truth=false File axiomPoly_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" File axiomPoly_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi" - # DbSNP - BOTH SNPs and INDELs. But used only as known in classic (which isn't used in Lite and so dropped in lite) - # Classic: known=true,training=false,truth=false + # DbSNP - BOTH SNPs and INDELs. 
File dbsnp_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf" File dbsnp_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx" # HapMap - SNPs - # Classic: known=false,training=true,truth=true File hapmap_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz" File hapmap_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi" # Mills - Indels - # Classic: known=false,training=true,truth=true File mills_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" File mills_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" # Omni - SNPs - # Classic: known=false,training=true,truth=true File omni_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz" File omni_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi" # 1000G - SNPs - # Classic: known=false,training=true,truth=false File one_thousand_genomes_resource_vcf = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" File one_thousand_genomes_resource_vcf_index = "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi" From 841152192bef1078bfe155db13d6630e5c9010da Mon Sep 17 00:00:00 2001 From: gbggrant Date: Fri, 21 Apr 2023 12:03:32 -0400 Subject: [PATCH 24/25] Pass runtime attributes to VQSR Lite wdl. 
--- scripts/variantstore/wdl/GvsCreateFilterSet.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index a50a97494fc..9dcb37a8fef 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -24,6 +24,8 @@ workflow GvsCreateFilterSet { Int? SNP_VQSR_CLASSIC_mem_gb_override RuntimeAttributes? vqsr_lite_extract_runtime_attributes = {"command_mem_gb": 27} + RuntimeAttributes? vqsr_lite_train_runtime_attributes = {"command_mem_gb": 27} + RuntimeAttributes? vqsr_lite_score_runtime_attributes = {"command_mem_gb": 15} } File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" @@ -117,8 +119,8 @@ workflow GvsCreateFilterSet { extract_extra_args = "-L ${interval_list} --use-allele-specific-annotations", score_extra_args = "-L ${interval_list} --use-allele-specific-annotations", extract_runtime_attributes = vqsr_lite_extract_runtime_attributes, - train_runtime_attributes = {"command_mem_gb": 27}, - score_runtime_attributes = {"command_mem_gb": 15}, + train_runtime_attributes = vqsr_lite_train_runtime_attributes, + score_runtime_attributes = vqsr_lite_score_runtime_attributes, gatk_docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0", gatk_override = gatk_override, monitoring_script = "gs://gvs-internal/cromwell_monitoring_script.sh" From e276681494c07f0e28a22c49a7fde165eab844f3 Mon Sep 17 00:00:00 2001 From: gbggrant Date: Fri, 21 Apr 2023 13:50:09 -0400 Subject: [PATCH 25/25] Remove branches from .dockstore.yml --- .dockstore.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index 9586879f7ad..037ea056c57 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -95,7 +95,6 @@ workflows: branches: - master - ah_var_store - - gg_VS-776_UpdateToLatestVQSRLite - name: GvsPopulateAltAllele subclass: WDL primaryDescriptorPath: 
/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl @@ -240,7 +239,6 @@ workflows: - master - ah_var_store - vs_888_fix_broken_gsutil_docker - - gg_VS-776_UpdateToLatestVQSRLite - name: GvsIngestTieout subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl