Skip to content

Commit

Permalink
VS-1397 update tests echo callset version (#8848)
Browse files Browse the repository at this point in the history
* VS-1397 Update Tests - ah_var_store edition (#8846)
** Add ValidateVariants to our tests.
** Bringing in Rori's change to add EXCESS_ALLELES to VCF Headers.
** Updated truth path.
* Scatter adjustments [VS-516] (#8835)

Co-authored-by: Miguel Covarrubias <mcovarr@users.noreply.github.com>
  • Loading branch information
gbggrant and mcovarr authored May 28, 2024
1 parent 293d90e commit 634bce9
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 8 deletions.
16 changes: 10 additions & 6 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,16 @@ workflow GvsExtractCallset {

# scatter for WGS and exome samples based on past successful runs and NOT optimized
Int effective_scatter_count = if defined(scatter_count) then select_first([scatter_count])
else if GetNumSamplesLoaded.num_samples < 100 then 100 # Quickstart
else if GetNumSamplesLoaded.num_samples < 1000 then 500
else if GetNumSamplesLoaded.num_samples < 5000 then 1000
else if GetNumSamplesLoaded.num_samples < 20000 then 2000 # Stroke Anderson
else if GetNumSamplesLoaded.num_samples < 50000 then 10000
else if is_wgs then 20000 else 7500
else if is_wgs then
if GetNumSamplesLoaded.num_samples < 5000 then 1 # This results in 1 VCF per chromosome.
else if GetNumSamplesLoaded.num_samples < 20000 then 2000 # Stroke Anderson
else if GetNumSamplesLoaded.num_samples < 50000 then 10000
else 20000
else
if GetNumSamplesLoaded.num_samples < 5000 then 1 # This results in 1 VCF per chromosome.
else if GetNumSamplesLoaded.num_samples < 20000 then 1000
else if GetNumSamplesLoaded.num_samples < 50000 then 2500
else 7500

Int effective_split_intervals_disk_size_override = select_first([split_intervals_disk_size_override,
if GetNumSamplesLoaded.num_samples < 100 then 50 # Quickstart
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ workflow GvsQuickstartIntegration {
File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"
String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else ""
File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-03-13/" + expected_subdir
File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-05-23/" + expected_subdir

# WDL 1.0 trick to set a variable ('none') to be undefined.
if (false) {
Expand Down
60 changes: 60 additions & 0 deletions scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ workflow GvsQuickstartVcfIntegration {
Int? maximum_alternate_alleles
}
String project_id = "gvs-internal"
File reference_fasta = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"

# WDL 1.0 trick to set a variable ('none') to be undefined.
if (false) {
Expand Down Expand Up @@ -147,6 +148,16 @@ workflow GvsQuickstartVcfIntegration {
}
}

# Validate every VCF produced by JointVariantCalling: one ValidateVcf call per index into
# output_vcfs, pairing each VCF with the index file at the same position in output_vcf_indexes.
scatter(i in range(length(JointVariantCalling.output_vcfs))) {
call ValidateVcf {
input:
input_vcf = JointVariantCalling.output_vcfs[i],
input_vcf_index = JointVariantCalling.output_vcf_indexes[i],
ref_fasta = reference_fasta,
gatk_docker = effective_gatk_docker,
}
}

output {
Array[File] output_vcfs = JointVariantCalling.output_vcfs
Array[File] output_vcf_indexes = JointVariantCalling.output_vcf_indexes
Expand Down Expand Up @@ -429,3 +440,52 @@ task AssertTableSizesAreExpected {
File differences = "differences.txt"
}
}

# Runs GATK ValidateVariants on a single VCF against the supplied reference FASTA.
#
# Inputs:
#   input_vcf          VCF to validate (passed to gatk as -V).
#   input_vcf_index    Index accompanying input_vcf.
#   ref_fasta          Reference FASTA (passed to gatk as -R).
#   preemptible_tries  Optional preemptible attempt count; defaults to 3.
#   gatk_docker        Docker image that provides the `gatk` executable.
#
# Output:
#   monitoring_log     Output of the background resource-monitoring script.
#
# The shell runs with `errexit`, so a non-zero exit from gatk fails the task.
task ValidateVcf {
  input {
    File input_vcf
    # Not referenced in the command below.
    # NOTE(review): presumably GATK locates the index alongside the VCF path -- confirm.
    File input_vcf_index
    File ref_fasta

    Int? preemptible_tries
    String gatk_docker
  }

  File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"

  # All three file inputs are marked localization_optional so the backend may hand the
  # tool their original paths instead of copying them into the container first.
  parameter_meta {
    input_vcf: {
      localization_optional: true
    }
    input_vcf_index: {
      localization_optional: true
    }
    ref_fasta: {
      localization_optional: true
    }
  }

  command <<<
    # Prepend date, time and pwd to xtrace log entries.
    PS4='\D{+%F %T} \w $ '
    set -o errexit -o nounset -o pipefail -o xtrace

    # Resource monitoring runs in the background for the lifetime of the task.
    bash ~{monitoring_script} > monitoring.log &

    gatk --java-options -Xmx3g ValidateVariants \
      -V ~{input_vcf} \
      -R ~{ref_fasta}

  >>>

  runtime {
    docker: gatk_docker
    preemptible: select_first([preemptible_tries, 3])
    # Keep the container larger than the 3g Java heap: the JVM also needs non-heap memory
    # (metaspace, thread stacks, native buffers) and the monitoring script runs alongside it.
    # With heap == container memory the task can be OOM-killed before the heap limit is hit.
    memory: "4 GiB"
    disks: "local-disk 100 HDD"
  }

  output {
    File monitoring_log = "monitoring.log"
  }
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ task GetToolVersions {
# there are a handful of tasks that require the larger GNU libc-based `slim`.
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-05-06-alpine-778d8a77294d"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-05-06-gatkbase-e37f6cd67c45"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_05_24-gatkbase-cdc749be72ba"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ private static VCFHeader generateVcfHeader(Set<String> sampleNames,

// Filter fields
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.LOW_QUAL_FILTER_NAME));
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.EXCESS_ALLELES));
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.EXCESS_HET_KEY));
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.NO_HQ_GENOTYPES));

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
Expand Down

0 comments on commit 634bce9

Please sign in to comment.