diff --git a/.dockstore.yml b/.dockstore.yml
index 0855ac45370..252512ad7d1 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -208,7 +208,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - vs_648_alpine
+        - vs_655_avro_extract_warn_on_bad_filter_name
   - name: GvsIngestTieout
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
@@ -230,7 +230,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - vs_616_split_hail
+        - vs_655_avro_extract_warn_on_bad_filter_name
   - name: GvsCallsetStatistics
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsCallsetStatistics.wdl
@@ -238,7 +238,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - vs_616_split_hail
+        - vs_655_avro_extract_warn_on_bad_filter_name
   - name: MitochondriaPipeline
     subclass: WDL
     primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
diff --git a/scripts/variantstore/wdl/GvsCallsetStatistics.wdl b/scripts/variantstore/wdl/GvsCallsetStatistics.wdl
index 71295af64a7..72a3928bc53 100644
--- a/scripts/variantstore/wdl/GvsCallsetStatistics.wdl
+++ b/scripts/variantstore/wdl/GvsCallsetStatistics.wdl
@@ -1,5 +1,7 @@
 version 1.0
 
+import "GvsUtils.wdl" as Utils
+
 workflow GvsCallsetStatistics {
     input {
         String project_id
@@ -11,8 +13,16 @@ workflow GvsCallsetStatistics {
         String statistics_table = "~{extract_prefix}_statistics"
     }
 
+    call Utils.ValidateFilterSetName {
+        input:
+            data_project = project_id,
+            data_dataset = dataset_name,
+            filter_set_name = filter_set_name
+    }
+
     call CreateTables {
         input:
+            go = ValidateFilterSetName.done,
             project_id = project_id,
             dataset_name = dataset_name,
             metrics_table = metrics_table,
@@ -72,6 +82,7 @@ workflow GvsCallsetStatistics {
 
 task CreateTables {
     input {
+        Boolean go = true
         String project_id
         String dataset_name
         String metrics_table
diff --git a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
index ec09869cb66..713fb1e0a7a 100644
--- a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
+++ b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -11,7 +11,16 @@ workflow GvsExtractAvroFilesForHail {
         Int scatter_width = 10
     }
 
-    call OutputPath { input: go = true }
+    call Utils.ValidateFilterSetName {
+        input:
+            data_project = project_id,
+            data_dataset = dataset,
+            filter_set_name = filter_set_name
+    }
+
+    call OutputPath {
+        input: go = ValidateFilterSetName.done
+    }
 
     call ExtractFromNonSuperpartitionedTables {
         input:
diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl
index 016ed02a030..b15a08c7095 100644
--- a/scripts/variantstore/wdl/GvsExtractCallset.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -114,7 +114,7 @@ workflow GvsExtractCallset {
   }
 
   if ( !do_not_filter_override ) {
-    call ValidateFilterSetName {
+    call Utils.ValidateFilterSetName {
       input:
         query_project = query_project,
         filter_set_name = filter_set_name,
@@ -138,6 +138,7 @@ workflow GvsExtractCallset {
 
     call ExtractTask {
       input:
+        go = select_first([ValidateFilterSetName.done, true]),
         dataset_id = dataset_name,
         call_set_identifier = call_set_identifier,
         gatk_override = gatk_override,
@@ -156,7 +157,6 @@ workflow GvsExtractCallset {
         fq_filter_set_site_table = fq_filter_set_site_table,
         fq_filter_set_tranches_table = fq_filter_set_tranches_table,
         filter_set_name = filter_set_name,
-        filter_set_name_verified = select_first([ValidateFilterSetName.done, "done"]),
         drop_state = drop_state,
         output_file = vcf_filename,
         output_gcs_dir = output_gcs_dir,
@@ -207,52 +207,11 @@ workflow GvsExtractCallset {
   }
 }
 
-task ValidateFilterSetName {
-  input {
-    String filter_set_name
-    String data_project
-    String data_dataset
-    String query_project
-    String filter_set_info_timestamp
-  }
-  meta {
-    # Not `volatile: true` since there shouldn't be a need to re-run this if there has already been a successful execution.
-  }
-
-  # add labels for DSP Cloud Cost Control Labeling and Reporting
-  String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"
-
-  command <<<
-    set -ex
-
-    echo "project_id = ~{query_project}" > ~/.bigqueryrc
-
-    OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
-    FILTERSETS=${OUTPUT#"available_filter_set_names"}
-
-    if [[ $FILTERSETS =~ "~{filter_set_name}" ]]; then
-      echo "Filter set name '~{filter_set_name}' found."
-    else
-      echo "ERROR: '~{filter_set_name}' is not an existing filter_set_name. Available in ~{data_project}.~{data_dataset} are"
-      echo $FILTERSETS
-      exit 1
-    fi
-  >>>
-  output {
-    String done = read_string(stdout())
-  }
-
-  runtime {
-    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:398.0.0"
-    memory: "3 GB"
-    disks: "local-disk 10 HDD"
-    preemptible: 3
-    cpu: 1
-  }
-}
 
 task ExtractTask {
   input {
+    Boolean go
+
     String dataset_id
     String call_set_identifier
 
@@ -283,7 +242,6 @@ task ExtractTask {
     String fq_filter_set_site_table
     String fq_filter_set_tranches_table
     String? filter_set_name
-    String filter_set_name_verified
 
     # Runtime Options:
     File? gatk_override
@@ -398,9 +356,9 @@ task SumBytes {
     print(total_mb);"
   >>>
   runtime {
-    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:398.0.0"
+    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
     memory: "3 GB"
-    disks: "local-disk 10 HDD"
+    disks: "local-disk 500 HDD"
     preemptible: 3
     cpu: 1
   }
@@ -437,9 +395,9 @@ task CreateManifest {
   }
 
   runtime {
-    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:398.0.0"
+    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
     memory: "3 GB"
-    disks: "local-disk 10 HDD"
+    disks: "local-disk 500 HDD"
     preemptible: 3
     cpu: 1
   }
@@ -479,9 +437,9 @@ task GenerateSampleListFile {
   }
 
   runtime {
-    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:398.0.0"
+    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
     memory: "3 GB"
-    disks: "local-disk 10 HDD"
+    disks: "local-disk 500 HDD"
     preemptible: 3
     cpu: 1
   }
diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
index 9513741aba4..87106c5ffd1 100644
--- a/scripts/variantstore/wdl/GvsUtils.wdl
+++ b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -419,3 +419,56 @@ task CountSuperpartitions {
     Int num_superpartitions = read_int('num_superpartitions.txt')
   }
 }
+
+# Fails (exit 1) unless `filter_set_name` is one of the filter_set_name values
+# found in `data_project.data_dataset.filter_set_info`. On success the `done`
+# output is `true`, which callers can feed into a downstream task's `go` input.
+task ValidateFilterSetName {
+  input {
+    # Not referenced by the command; available so a caller can sequence this task after another call.
+    Boolean go = true
+    String filter_set_name
+    String data_project
+    String data_dataset
+    String query_project = data_project
+    # NOTE(review): not referenced by the command; presumably only contributes to the call-caching key -- confirm.
+    String filter_set_info_timestamp = ""
+  }
+  meta {
+    # Not `volatile: true` since there shouldn't be a need to re-run this if there has already been a successful execution.
+  }
+
+  # add labels for DSP Cloud Cost Control Labeling and Reporting
+  String bq_labels = "--label service:gvs --label team:variants --label managedby:gvs_utils"
+
+  command <<<
+    set -o errexit -o nounset -o xtrace -o pipefail
+
+    echo "project_id = ~{query_project}" > ~/.bigqueryrc
+
+    OUTPUT=$(bq --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
+    # Strip the CSV header (the column alias) so only the names remain, one per line.
+    FILTERSETS=${OUTPUT#"available_filter_set_names"}
+
+    # Require a non-empty name that equals one whole output line (-x) as a literal string (-F).
+    # A quoted right-hand side of =~ is an unanchored substring test, which would accept partial names.
+    if [[ -n "~{filter_set_name}" ]] && grep -qxF -- "~{filter_set_name}" <<< "$FILTERSETS"; then
+      echo "Filter set name '~{filter_set_name}' found."
+    else
+      echo "ERROR: '~{filter_set_name}' is not an existing filter_set_name. Available in ~{data_project}.~{data_dataset} are"
+      echo "$FILTERSETS"
+      exit 1
+    fi
+  >>>
+  output {
+    Boolean done = true
+  }
+
+  runtime {
+    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine"
+    memory: "3 GB"
+    disks: "local-disk 500 HDD"
+    preemptible: 3
+    cpu: 1
+  }
+}