From 6d973172949ce57ce0b5a311528ee71a57c09a9f Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 13 Jan 2022 10:40:05 -0500 Subject: [PATCH 01/10] update jars --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 7e32b7ab18e..70267d07947 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -17,7 +17,7 @@ workflow GvsImportGenomes { Int batch_size = 1 Int? preemptible_tries - File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220111/gatk-package-4.2.0.0-455-g40a40bc-SNAPSHOT-local.jar" + File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_ranges_prepare_20220118/gatk-package-4.2.0.0-462-gc0e684c-SNAPSHOT-local.jar" String? docker } From 6865f6575233a5299f38ee2ea575a7b64e81825f Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 4 Jan 2022 13:34:47 -0500 Subject: [PATCH 02/10] dockstore for testing --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index 19934bbf221..8a1a7c191e1 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -128,6 +128,7 @@ workflows: branches: - master - ah_var_store + - rc-vs-268-import-more-samples - name: GvsPrepareCallset subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl From 1a140c2ef9204b3846fed37baf8d64faa9633834 Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 4 Jan 2022 17:54:11 -0500 Subject: [PATCH 03/10] make a sample name table to join to --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 70267d07947..132d92d97bd 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -396,9 +396,31 @@ task GetSampleIds { echo "project_id = ~{project_id}" > ~/.bigqueryrc + # get the current maximum id, or 0 if there are none + #bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ + # "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} where sample_name in ('~{sep="\',\'" external_sample_names}')" > results + + + # the above query is causing an out of bounds error that I'm hoping to solve by loading the data into a temp table and doing a join + # SO load in and create a temp table with all the sample names in external_sample_names + + # 1. create temp table with the sample_names + TEMP_TABLE="~{dataset_name}.sample_names_to_load " + # bq --location=US mk ${PARTITION_STRING} ${CLUSTERING_STRING} --project_id=~{project_id} $TABLE schema.json + # TODO does this need a hash for the temp table name!??! + bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING" + + # 2. load external sample names into temp table + NAMES_FILE=~{write_lines(external_sample_names)} + bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING" + + # 3. get the max from the temp table + # get the current maximum id, or 0 if there are none bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ - "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} where sample_name in ('~{sep="\',\'" external_sample_names}')" > results + "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > results + + # prep for being able to return min table id min_sample_id=$(tail -1 results | cut -d, -f1) From 8e61483d32686091963a8687934ec16dec889b7a Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 4 Jan 2022 18:07:32 -0500 Subject: [PATCH 04/10] make it temp --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 132d92d97bd..ec7b61e7aa6 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -408,6 +408,7 @@ task GetSampleIds { TEMP_TABLE="~{dataset_name}.sample_names_to_load " # bq --location=US mk ${PARTITION_STRING} ${CLUSTERING_STRING} --project_id=~{project_id} $TABLE schema.json # TODO does this need a hash for the temp table name!??! + # TODO this needs a time to live bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING" # 2. load external sample names into temp table From 1c1405ba3cd5e77c6bcc64f50d6a44bf3b7267b3 Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 6 Jan 2022 16:55:24 -0500 Subject: [PATCH 05/10] replace the second list with the join --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index ec7b61e7aa6..3c6e3864194 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -437,7 +437,7 @@ task GetSampleIds { python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \ - "SELECT sample_id, sample_name FROM ~{dataset_name}.~{table_name} where sample_name in ('~{sep="\',\'" external_sample_names}')" > sample_map + "SELECT sample_id, sample_name FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > sample_map cut -d, -f1 sample_map > gvs_ids From c2fdc2c194dea8f31950fcfc0c41d15476b398ab Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 6 Jan 2022 21:25:33 -0500 Subject: [PATCH 06/10] typo query fix --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 3c6e3864194..0f55be9c52f 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -437,7 +437,7 @@ task GetSampleIds { python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \ - "SELECT sample_id, sample_name FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > sample_map + "SELECT sample_id, samples.sample_name FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > sample_map cut -d, -f1 sample_map > gvs_ids From 02fce24c8a294d4af652b5e0f5b587b6fd938d97 Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 7 Jan 2022 15:09:30 -0500 Subject: [PATCH 07/10] delete temp table after done --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 0f55be9c52f..447e97185c0 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -396,33 +396,16 @@ task GetSampleIds { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # get the current maximum id, or 0 if there are none - #bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ - # "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} where sample_name in ('~{sep="\',\'" external_sample_names}')" > results - - - # the above query is causing an out of bounds error that I'm hoping to solve by loading the data into a temp table and doing a join - # SO load in and create a temp table with all the sample names in external_sample_names - - # 1. create temp table with the sample_names - TEMP_TABLE="~{dataset_name}.sample_names_to_load " - # bq --location=US mk ${PARTITION_STRING} ${CLUSTERING_STRING} --project_id=~{project_id} $TABLE schema.json - # TODO does this need a hash for the temp table name!??! - # TODO this needs a time to live + # create temp table with the sample_names and load external sample names into temp table + TEMP_TABLE="~{dataset_name}.sample_names_to_load" bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING" - - # 2. load external sample names into temp table NAMES_FILE=~{write_lines(external_sample_names)} bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING" - # 3. get the max from the temp table - # get the current maximum id, or 0 if there are none bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > results - - # prep for being able to return min table id min_sample_id=$(tail -1 results | cut -d, -f1) max_sample_id=$(tail -1 results | cut -d, -f2) @@ -441,6 +424,9 @@ task GetSampleIds { cut -d, -f1 sample_map > gvs_ids + // delete the table that was only needed for this ingest + bq --project_id=~{project_id} rm -f=true ${TEMP_TABLE} + >>> runtime { docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0" From 064b9c5d47d7a76ee2ba4bad2c7e72061e32cfc7 Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 7 Jan 2022 15:26:14 -0500 Subject: [PATCH 08/10] throw an error if the table exists already --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 447e97185c0..37bf62bb5cb 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -396,7 +396,20 @@ task GetSampleIds { echo "project_id = ~{project_id}" > ~/.bigqueryrc - # create temp table with the sample_names and load external sample names into temp table + # create temp table with the sample_names and load external sample names into temp table -- make sure it doesn't exist already + set +e + TEMP_TABLE="~{dataset_name}.sample_names_to_load" + bq show --project_id ~{project_id} ${TEMP_TABLE} > /dev/null + BQ_SHOW_RC=$? + set -e + + # if there is already a table of sample names or something else is wrong, bail + if [ $BQ_SHOW_RC -eq 0 ]; then + echo "There is already a list of sample names. This may need manual cleanup. Exiting" + exit 1 + fi + + echo "Creating the external sample name list table ${TEMP_TABLE}" TEMP_TABLE="~{dataset_name}.sample_names_to_load" bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING" NAMES_FILE=~{write_lines(external_sample_names)} From d2b4661911e9f0a6884f6c328711bdc1aed53a90 Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 11 Jan 2022 22:42:48 -0500 Subject: [PATCH 09/10] ack comments --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 37bf62bb5cb..72b13fbe16b 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -437,7 +437,7 @@ task GetSampleIds { cut -d, -f1 sample_map > gvs_ids - // delete the table that was only needed for this ingest + ## delete the table that was only needed for this ingest bq --project_id=~{project_id} rm -f=true ${TEMP_TABLE} >>> From 72bdfb20a6d02f7d2c0175a405b4eeabbabc27d9 Mon Sep 17 00:00:00 2001 From: Rori Cremer <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 24 Jan 2022 11:10:08 -0500 Subject: [PATCH 10/10] remove from dockstore --- .dockstore.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockstore.yml b/.dockstore.yml index 8a1a7c191e1..19934bbf221 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -128,7 +128,6 @@ workflows: branches: - master - ah_var_store - - rc-vs-268-import-more-samples - name: GvsPrepareCallset subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl