Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VS-268 import more samples at once #7629

Merged
merged 10 commits into from
Jan 24, 2022
28 changes: 25 additions & 3 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ workflow GvsImportGenomes {
Int batch_size = 1

Int? preemptible_tries
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220111/gatk-package-4.2.0.0-455-g40a40bc-SNAPSHOT-local.jar"
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_ranges_prepare_20220118/gatk-package-4.2.0.0-462-gc0e684c-SNAPSHOT-local.jar"
String? docker
}

Expand Down Expand Up @@ -396,9 +396,28 @@ task GetSampleIds {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

# create temp table with the sample_names and load external sample names into temp table -- make sure it doesn't exist already
set +e
TEMP_TABLE="~{dataset_name}.sample_names_to_load"
bq show --project_id ~{project_id} ${TEMP_TABLE} > /dev/null
BQ_SHOW_RC=$?
set -e

# if a table of sample names already exists (bq show succeeded), bail; any bq show failure is treated as "table not found" and we continue
if [ $BQ_SHOW_RC -eq 0 ]; then
echo "There is already a list of sample names. This may need manual cleanup. Exiting"
exit 1
fi

echo "Creating the external sample name list table ${TEMP_TABLE}"
TEMP_TABLE="~{dataset_name}.sample_names_to_load"
bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING"
NAMES_FILE=~{write_lines(external_sample_names)}
bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"

# get the current minimum and maximum sample ids for the requested samples (each is 0 if there are none)
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} where sample_name in ('~{sep="\',\'" external_sample_names}')" > results
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > results

# prep for being able to return min table id
min_sample_id=$(tail -1 results | cut -d, -f1)
Expand All @@ -414,10 +433,13 @@ task GetSampleIds {
python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id

bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
"SELECT sample_id, sample_name FROM ~{dataset_name}.~{table_name} where sample_name in ('~{sep="\',\'" external_sample_names}')" > sample_map
"SELECT sample_id, samples.sample_name FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > sample_map

cut -d, -f1 sample_map > gvs_ids

## delete the table that was only needed for this ingest
bq --project_id=~{project_id} rm -f=true ${TEMP_TABLE}

>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
Expand Down