From 99a0be076e5ec43213c22ae653fd0a5de5081265 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Wed, 18 May 2022 10:47:36 -0400 Subject: [PATCH 01/16] dockstore --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index 6b92684cb97..07326351f8d 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -121,6 +121,7 @@ workflows: branches: - master - ah_var_store + - rsa_skip_samples - name: GvsPrepareRangesCallset subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl From 9af2ca25503dafc4226d822813fe5ea00f20c1c5 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Wed, 18 May 2022 10:52:23 -0400 Subject: [PATCH 02/16] first stab --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index d28849a16a6..70bf69a60d1 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -37,14 +37,6 @@ workflow GvsImportGenomes { service_account_json_path = service_account_json_path } - call CheckForDuplicateData { - input: - dataset_name = dataset_name, - project_id = project_id, - sample_names = external_sample_names, - service_account_json_path = service_account_json_path - } - call CreateFOFNs { input: batch_size = 1, @@ -58,7 +50,6 @@ workflow GvsImportGenomes { input: dataset_name = dataset_name, project_id = project_id, - duplicate_check_passed = CheckForDuplicateData.done, drop_state = "FORTY", drop_state_includes_greater_than = false, input_vcf_indexes = read_lines(CreateFOFNs.vcf_batch_vcf_index_fofns[i]), @@ -136,8 +127,8 @@ task CheckForDuplicateData { cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false | sed -e '/sample_name/d' > duplicates - # remove the temp table - bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE} + # remove the temp table + bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE} # true if there is data in results if [ -s duplicates ]; then @@ -195,7 +186,6 @@ task LoadData { String dataset_name String project_id - Boolean duplicate_check_passed Array[File] input_vcf_indexes Array[File] input_vcfs File interval_list @@ -245,6 +235,8 @@ task LoadData { gcloud auth activate-service-account --key-file=local.service_account.json fi + echo "project_id = ~{project_id}" > ~/.bigqueryrc + # translate WDL arrays into BASH arrays VCFS_ARRAY=(~{sep=" " input_vcfs}) VCF_INDEXES_ARRAY=(~{sep=" " input_vcf_indexes}) @@ -252,12 +244,25 @@ task LoadData { # loop over the BASH arrays (See https://stackoverflow.com/questions/6723426/looping-over-arrays-printing-both-index-and-value) for i in "${!VCFS_ARRAY[@]}"; do + input_vcf="${VCFS_ARRAY[$i]}" input_vcf_basename=$(basename $input_vcf) updated_input_vcf=$input_vcf input_vcf_index="${VCF_INDEXES_ARRAY[$i]}" sample_name="${SAMPLE_NAMES_ARRAY[$i]}" + # first, see if this sample is already in the DB, and if so, skip + echo "SELECT DISTINCT i.sample_id FROM `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` p, `~{dataset_name}.sample_info` i WHERE i.sample_name = '${sample_name}' AND p.partition_id = CAST(i.sample_id AS STRING) AND p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" > query.sql + + cat query.sql + cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false | sed -e '/sampleid/d' > duplicates + + if [ -s duplicates ]; then + echo "Skipping already loaded sample, id: " $(cat duplicates) + rm duplicates + continue + fi + # we always do our own localization gsutil cp $input_vcf . gsutil cp $input_vcf_index . From 102cd8910d55a9df341135649cd09e165bdddf7e Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Wed, 18 May 2022 11:48:24 -0400 Subject: [PATCH 03/16] escape backticks --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 70bf69a60d1..15849998b7c 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -252,7 +252,7 @@ task LoadData { sample_name="${SAMPLE_NAMES_ARRAY[$i]}" # first, see if this sample is already in the DB, and if so, skip - echo "SELECT DISTINCT i.sample_id FROM `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` p, `~{dataset_name}.sample_info` i WHERE i.sample_name = '${sample_name}' AND p.partition_id = CAST(i.sample_id AS STRING) AND p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" > query.sql + echo "SELECT DISTINCT i.sample_id FROM \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` p, \`~{dataset_name}.sample_info\` i WHERE i.sample_name = '${sample_name}' AND p.partition_id = CAST(i.sample_id AS STRING) AND p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" > query.sql cat query.sql cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false | sed -e '/sampleid/d' > duplicates From 77f13a030bc82d16e2d5f23e92628b0303ba70ad Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Wed, 18 May 2022 12:19:01 -0400 Subject: [PATCH 04/16] fix logic --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 15849998b7c..9cff199aefb 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -254,11 +254,11 @@ task LoadData { # first, see if this sample is already in the DB, and if so, skip echo "SELECT DISTINCT i.sample_id FROM \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` p, \`~{dataset_name}.sample_info\` i WHERE i.sample_name = '${sample_name}' AND p.partition_id = CAST(i.sample_id AS STRING) AND p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" > query.sql - cat query.sql - cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false | sed -e '/sampleid/d' > duplicates + cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false | sed -e '/sample_id/d' > duplicates if [ -s duplicates ]; then - echo "Skipping already loaded sample, id: " $(cat duplicates) + else + echo "\nSkipping already loaded sample, id: " $(cat duplicates) rm duplicates continue fi From f5c4f4219ae3d6df413c237991af1f779b49cc97 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 19 May 2022 15:34:21 -0400 Subject: [PATCH 05/16] first stab with python --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 169 ++++++++---------- scripts/variantstore/wdl/extract/Dockerfile | 1 + .../wdl/extract/curate_input_array_files.py | 34 ++++ 3 files changed, 109 insertions(+), 95 deletions(-) create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_files.py diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 9cff199aefb..6c179c7fbf1 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -28,7 +28,7 @@ workflow GvsImportGenomes { } } - call GetSampleIds { + call GetUningestedSampleIds { input: dataset_name = dataset_name, project_id = project_id, @@ -37,12 +37,23 @@ workflow GvsImportGenomes { service_account_json_path = service_account_json_path } - call CreateFOFNs { + call CurateInputLists { input: - batch_size = 1, + dataset_name = dataset_name, + project_id = project_id, input_vcf_index_list = write_lines(input_vcf_indexes), input_vcf_list = write_lines(input_vcfs), - sample_name_list = write_lines(external_sample_names), + input_sample_name_list = write_lines(external_sample_names), + input_sample_map = GetUningestedSampleIds.sample_map, + service_account_json_path = service_account_json_path + } + + call CreateFOFNs { + input: + batch_size = 1, + input_vcf_index_list = CurateInputLists.index_list, + input_vcf_list = CurateInputLists.vcf_list, + sample_name_list = CurateInputLists.sample_name_list } scatter (i in range(length(CreateFOFNs.vcf_batch_vcf_fofns))) { @@ -59,7 +70,7 @@ workflow GvsImportGenomes { load_data_preemptible_override = load_data_preemptible_override, load_data_maxretries_override = load_data_maxretries_override, sample_names = read_lines(CreateFOFNs.vcf_sample_name_fofns[i]), - sample_map = GetSampleIds.sample_map, + sample_map = CurateInputLists.output_sample_map, service_account_json_path = service_account_json_path, } } @@ -78,78 +89,6 @@ workflow GvsImportGenomes { } } -task CheckForDuplicateData { - input { - String dataset_name - String project_id - - Array[String] sample_names - - String? service_account_json_path - } - - String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' - Int num_samples = length(sample_names) - - meta { - volatile: true - } - - command <<< - set -e - - if [ ~{has_service_account_file} = 'true' ]; then - gsutil cp ~{service_account_json_path} local.service_account.json - gcloud auth activate-service-account --key-file=local.service_account.json - gcloud config set project ~{project_id} - fi - - echo "project_id = ~{project_id}" > ~/.bigqueryrc - - INFO_SCHEMA_TABLE="~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS" - TEMP_TABLE="~{dataset_name}.sample_dupe_check" - SAMPLE_INFO_TABLE="~{dataset_name}.sample_info" - - # create a temp table with the sample_names - bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING" - NAMES_FILE=~{write_lines(sample_names)} - bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING" - - # check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions - # this returns the list of sample names that do already have data loaded - echo "WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded, s.withdrawn FROM \`${TEMP_TABLE}\` t left outer join \`${SAMPLE_INFO_TABLE}\` s on (s.sample_name = t.sample_name)) " >> query.sql - echo "SELECT i.sample_name FROM \`${INFO_SCHEMA_TABLE}\` p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" >> query.sql - echo "UNION DISTINCT " >> query.sql - echo "SELECT i.sample_name FROM items i WHERE i.is_loaded = True AND i.withdrawn IS NULL " >> query.sql - echo "UNION DISTINCT " >> query.sql - echo "SELECT i.sample_name FROM items i WHERE i.sample_id IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\`) " >> query.sql - - - cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false | sed -e '/sample_name/d' > duplicates - - # remove the temp table - bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE} - - # true if there is data in results - if [ -s duplicates ]; then - echo "ERROR: Trying to load samples that have already been loaded" - cat duplicates - exit 1 - fi - >>> - runtime { - docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0" - memory: "1 GB" - disks: "local-disk 10 HDD" - preemptible: 5 - cpu: 1 - } - output { - Boolean done = true - File? duplicates = "duplicates" - } -} - task CreateFOFNs { input { Int batch_size @@ -256,8 +195,7 @@ task LoadData { cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false | sed -e '/sample_id/d' > duplicates - if [ -s duplicates ]; then - else + if ! [ -s duplicates ]; then echo "\nSkipping already loaded sample, id: " $(cat duplicates) rm duplicates continue @@ -298,8 +236,6 @@ task LoadData { } } - - task TerminateWorkflow { input { String message @@ -362,7 +298,7 @@ task SetIsLoadedColumn { } } -task GetSampleIds { +task GetUningestedSampleIds { meta { volatile: true } @@ -391,20 +327,19 @@ task GetSampleIds { echo "project_id = ~{project_id}" > ~/.bigqueryrc # create temp table with the sample_names and load external sample names into temp table -- make sure it doesn't exist already - set +e - TEMP_TABLE="~{dataset_name}.sample_names_to_load" - bq show --project_id ~{project_id} ${TEMP_TABLE} > /dev/null - BQ_SHOW_RC=$? - set -e - - # if there is already a table of sample names or something else is wrong, bail - if [ $BQ_SHOW_RC -eq 0 ]; then - echo "There is already a list of sample names. This may need manual cleanup. Exiting" - exit 1 - fi + set +e + TEMP_TABLE="~{dataset_name}.sample_names_to_load" + bq show --project_id ~{project_id} ${TEMP_TABLE} > /dev/null + BQ_SHOW_RC=$? + set -e + + # if there is already a table of sample names or something else is wrong, bail + if [ $BQ_SHOW_RC -eq 0 ]; then + echo "There is already a list of sample names. This may need manual cleanup. Exiting" + exit 1 + fi echo "Creating the external sample name list table ${TEMP_TABLE}" - TEMP_TABLE="~{dataset_name}.sample_names_to_load" bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING" NAMES_FILE=~{write_lines(external_sample_names)} bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING" @@ -426,8 +361,9 @@ task GetSampleIds { python3 -c "from math import ceil; print(ceil($max_sample_id/~{samples_per_table}))" > max_sample_id python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id + # get sample map of samples that haven't been loaded yet bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \ - "SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > sample_map + "SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name WHERE samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status='FINISHED')" > sample_map cut -d, -f1 sample_map > gvs_ids @@ -448,3 +384,46 @@ task GetSampleIds { File gvs_ids = "gvs_ids" } } + +task CurateInputLists { + input { + String dataset_name + String project_id + File input_vcf_index_list + File input_vcf_list + File input_sample_map + File input_sample_name_list + + String? service_account_json_path + } + + String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' + command <<< + set -ex + if [ ~{has_service_account_file} = 'true' ]; then + gsutil cp ~{service_account_json_path} local.service_account.json + gcloud auth activate-service-account --key-file=local.service_account.json + fi + + gsutil cp ~{input_vcf_index_list} input_vcf_index_list + gsutil cp ~{input_vcf_list} input_vcf_list + gsutil cp ~{input_sample_map} input_sample_map + gsutil cp ~{input_sample_name_list} input_sample_name_list + + python3 /app/curate_input_array_files.py + >>> + runtime { + docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519" + memory: "3 GB" + disks: "local-disk 100 HDD" + bootDiskSizeGb: 15 + preemptible: 0 + cpu: 1 + } + + output { + File index_list = "output_vcf_index_list" + File vcf_list = "output_vcf_list" + File sample_name_list = "output_sample_name_list" + } +} diff --git a/scripts/variantstore/wdl/extract/Dockerfile b/scripts/variantstore/wdl/extract/Dockerfile index 3a8103fc666..54703eefaf5 100644 --- a/scripts/variantstore/wdl/extract/Dockerfile +++ b/scripts/variantstore/wdl/extract/Dockerfile @@ -14,5 +14,6 @@ COPY alt_allele_positions.sql /app COPY alt_allele_temp_function.sql /app COPY utils.py /app COPY add_max_as_vqslod.py /app +COPY curate_input_array_files.py /app WORKDIR /app diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py new file mode 100644 index 00000000000..26fe13e1b69 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_files.py @@ -0,0 +1,34 @@ +import argparse +import numpy as np + +SAMPLE_MAP_FILE_SUFFIX = "sample_map_file" +SAMPLE_NAME_FILE_SUFFIX = "sample_names_list_file" +VCF_FILE_SUFFIX = "vcf_list_file" +VCF_INDEX_FILE_SUFFIX = "vcf_index_list_file" + +def curate_input_arrays(): + sample_map_array = np.loadtxt(f"input_{SAMPLE_MAP_FILE_SUFFIX}", dtype=str, delimiter=",") + vcf_array = np.loadtxt(f"input_{VCF_FILE_SUFFIX}", dtype=str) + vcf_indexes_array = np.loadtxt(f"input_{VCF_INDEX_FILE_SUFFIX}", dtype=str) + sample_names_array = np.loadtxt(f"input_{SAMPLE_NAME_FILE_SUFFIX}", dtype=str, delimiter="\n") + rows_to_delete = [] + + # use input_sample_names_array to figure out which index "rows" to delete + for i in range(len(sample_names_array)): + if sample_names_array[i] not in sample_map_array: + rows_to_delete.append(i) + + + # re-create input arrays using array of "rows" to delete + vcf_array = [vcf_array[i] for i, e in enumerate(vcf_array) if i not in rows_to_delete] + vcf_indexes_array = [vcf_indexes_array[i] for i, e in enumerate(vcf_indexes_array) if i not in rows_to_delete] + sample_names_array = [sample_names_array[i] for i, e in enumerate(sample_names_array) if i not in rows_to_delete] + + np.savetxt(f"output_{SAMPLE_NAME_FILE_SUFFIX}", sample_names_array, fmt='%s') + np.savetxt(f"output_{VCF_FILE_SUFFIX}", vcf_array, fmt='%s') + np.savetxt(f"output_{VCF_INDEX_FILE_SUFFIX}", vcf_indexes_array, fmt='%s') + +if __name__ == '__main__': + parser = argparse.ArgumentParser(allow_abbrev=False, description=f"Expects that four files exist: input_{SAMPLE_MAP_FILE_SUFFIX}, input_{SAMPLE_NAME_FILE_SUFFIX}, input_{VCF_FILE_SUFFIX}, input_{VCF_INDEX_FILE_SUFFIX}; will create the following files: output_{SAMPLE_MAP_FILE_SUFFIX}, output_{SAMPLE_NAME_FILE_SUFFIX}, output_{VCF_FILE_SUFFIX}, output_{VCF_INDEX_FILE_SUFFIX}") + + curate_input_arrays() From deec99dd4b340571b34fd117cf6fad03d1dc0d69 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 19 May 2022 15:38:47 -0400 Subject: [PATCH 06/16] whoops, wrong sample_map --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 6c179c7fbf1..8b503e7d7a5 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -70,7 +70,7 @@ workflow GvsImportGenomes { load_data_preemptible_override = load_data_preemptible_override, load_data_maxretries_override = load_data_maxretries_override, sample_names = read_lines(CreateFOFNs.vcf_sample_name_fofns[i]), - sample_map = CurateInputLists.output_sample_map, + sample_map = GetUningestedSampleIds.sample_map, service_account_json_path = service_account_json_path, } } @@ -174,8 +174,6 @@ task LoadData { gcloud auth activate-service-account --key-file=local.service_account.json fi - echo "project_id = ~{project_id}" > ~/.bigqueryrc - # translate WDL arrays into BASH arrays VCFS_ARRAY=(~{sep=" " input_vcfs}) VCF_INDEXES_ARRAY=(~{sep=" " input_vcf_indexes}) @@ -183,24 +181,12 @@ task LoadData { # loop over the BASH arrays (See https://stackoverflow.com/questions/6723426/looping-over-arrays-printing-both-index-and-value) for i in "${!VCFS_ARRAY[@]}"; do - input_vcf="${VCFS_ARRAY[$i]}" input_vcf_basename=$(basename $input_vcf) updated_input_vcf=$input_vcf input_vcf_index="${VCF_INDEXES_ARRAY[$i]}" sample_name="${SAMPLE_NAMES_ARRAY[$i]}" - # first, see if this sample is already in the DB, and if so, skip - echo "SELECT DISTINCT i.sample_id FROM \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` p, \`~{dataset_name}.sample_info\` i WHERE i.sample_name = '${sample_name}' AND p.partition_id = CAST(i.sample_id AS STRING) AND p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" > query.sql - - cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false | sed -e '/sample_id/d' > duplicates - - if ! [ -s duplicates ]; then - echo "\nSkipping already loaded sample, id: " $(cat duplicates) - rm duplicates - continue - fi - # we always do our own localization gsutil cp $input_vcf . gsutil cp $input_vcf_index . From 4135ef58d7b0c75798248a482c31fba713092403 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 19 May 2022 16:21:03 -0400 Subject: [PATCH 07/16] add numpy to requirements.txt --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- scripts/variantstore/wdl/extract/requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 8b503e7d7a5..0bf351b2b2a 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -399,7 +399,7 @@ task CurateInputLists { python3 /app/curate_input_array_files.py >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519" + docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519_2" memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 diff --git a/scripts/variantstore/wdl/extract/requirements.txt b/scripts/variantstore/wdl/extract/requirements.txt index 969f3857400..84e36661d59 100644 --- a/scripts/variantstore/wdl/extract/requirements.txt +++ b/scripts/variantstore/wdl/extract/requirements.txt @@ -1,2 +1,3 @@ google-cloud-bigquery ijson +numpy From bb87dc12c1dff9469095fe22cc9cf1be37b5f20b Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 19 May 2022 16:39:10 -0400 Subject: [PATCH 08/16] name the files consistently --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 0bf351b2b2a..bc94998d60f 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -391,10 +391,10 @@ task CurateInputLists { gcloud auth activate-service-account --key-file=local.service_account.json fi - gsutil cp ~{input_vcf_index_list} input_vcf_index_list - gsutil cp ~{input_vcf_list} input_vcf_list - gsutil cp ~{input_sample_map} input_sample_map - gsutil cp ~{input_sample_name_list} input_sample_name_list + gsutil cp ~{input_vcf_index_list} input_vcf_index_file + gsutil cp ~{input_vcf_list} input_vcf_file + gsutil cp ~{input_sample_map} input_sample_map_file + gsutil cp ~{input_sample_name_list} input_sample_name_file python3 /app/curate_input_array_files.py >>> From f0bd8a421a3a27a432cc8fedfcce9167095ae314 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 19 May 2022 16:52:29 -0400 Subject: [PATCH 09/16] more syncing of file names --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 14 +++++++------- .../wdl/extract/curate_input_array_files.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index bc94998d60f..7b8e131150b 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -391,15 +391,15 @@ task CurateInputLists { gcloud auth activate-service-account --key-file=local.service_account.json fi - gsutil cp ~{input_vcf_index_list} input_vcf_index_file - gsutil cp ~{input_vcf_list} input_vcf_file + gsutil cp ~{input_vcf_index_list} input_vcf_index_list_file + gsutil cp ~{input_vcf_list} input_vcf_list_file gsutil cp ~{input_sample_map} input_sample_map_file - gsutil cp ~{input_sample_name_list} input_sample_name_file + gsutil cp ~{input_sample_name_list} input_sample_name_list_file python3 /app/curate_input_array_files.py >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519_2" + docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519_4" memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 @@ -408,8 +408,8 @@ task CurateInputLists { } output { - File index_list = "output_vcf_index_list" - File vcf_list = "output_vcf_list" - File sample_name_list = "output_sample_name_list" + File index_list = "output_vcf_index_list_file" + File vcf_list = "output_vcf_list_file" + File sample_name_list = "output_sample_name_list_file" } } diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py index 26fe13e1b69..0fcbab3d0e0 100644 --- a/scripts/variantstore/wdl/extract/curate_input_array_files.py +++ b/scripts/variantstore/wdl/extract/curate_input_array_files.py @@ -2,7 +2,7 @@ import numpy as np SAMPLE_MAP_FILE_SUFFIX = "sample_map_file" -SAMPLE_NAME_FILE_SUFFIX = "sample_names_list_file" +SAMPLE_NAME_FILE_SUFFIX = "sample_name_list_file" VCF_FILE_SUFFIX = "vcf_list_file" VCF_INDEX_FILE_SUFFIX = "vcf_index_list_file" From 96c75190928908bc6eec5739954c122bb18bd361 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 19 May 2022 18:32:14 -0400 Subject: [PATCH 10/16] add file validation --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- .../wdl/extract/curate_input_array_files.py | 34 +++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) mode change 100644 => 100755 scripts/variantstore/wdl/extract/curate_input_array_files.py diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index e18e6b37a10..4c1930e6186 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -383,7 +383,7 @@ task CurateInputLists { python3 /app/curate_input_array_files.py >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519_4" + docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519_5" memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py old mode 100644 new mode 100755 index 0fcbab3d0e0..13e27ea1bc1 --- a/scripts/variantstore/wdl/extract/curate_input_array_files.py +++ b/scripts/variantstore/wdl/extract/curate_input_array_files.py @@ -1,16 +1,29 @@ -import argparse import numpy as np +from contextlib import contextmanager SAMPLE_MAP_FILE_SUFFIX = "sample_map_file" SAMPLE_NAME_FILE_SUFFIX = "sample_name_list_file" VCF_FILE_SUFFIX = "vcf_list_file" VCF_INDEX_FILE_SUFFIX = "vcf_index_list_file" +@contextmanager +def handle_file_error(file_name): + try: + yield + except: + print(f"ERROR: required file named '{file_name}' does not exist.") + + def curate_input_arrays(): - sample_map_array = np.loadtxt(f"input_{SAMPLE_MAP_FILE_SUFFIX}", dtype=str, delimiter=",") - vcf_array = np.loadtxt(f"input_{VCF_FILE_SUFFIX}", dtype=str) - vcf_indexes_array = np.loadtxt(f"input_{VCF_INDEX_FILE_SUFFIX}", dtype=str) - sample_names_array = np.loadtxt(f"input_{SAMPLE_NAME_FILE_SUFFIX}", dtype=str, delimiter="\n") + sample_map_array = vcf_array = vcf_indexes_array = sample_names_array = [] + with handle_file_error(f"input_{SAMPLE_MAP_FILE_SUFFIX}"): + sample_map_array = np.loadtxt(f"input_{SAMPLE_MAP_FILE_SUFFIX}", dtype=str, delimiter=",") + with handle_file_error(f"input_{VCF_FILE_SUFFIX}"): + vcf_array = np.loadtxt(f"input_{VCF_FILE_SUFFIX}", dtype=str) + with handle_file_error(f"input_{VCF_INDEX_FILE_SUFFIX}"): + vcf_indexes_array = np.loadtxt(f"input_{VCF_INDEX_FILE_SUFFIX}", dtype=str) + with handle_file_error(f"input_{SAMPLE_NAME_FILE_SUFFIX}"): + sample_names_array = np.loadtxt(f"input_{SAMPLE_NAME_FILE_SUFFIX}", dtype=str) rows_to_delete = [] # use input_sample_names_array to figure out which index "rows" to delete @@ -18,17 +31,18 @@ def curate_input_arrays(): if sample_names_array[i] not in sample_map_array: rows_to_delete.append(i) - # re-create input arrays using array of "rows" to delete vcf_array = [vcf_array[i] for i, e in enumerate(vcf_array) if i not in rows_to_delete] - vcf_indexes_array = [vcf_indexes_array[i] for i, e in enumerate(vcf_indexes_array) if i not in rows_to_delete] - sample_names_array = [sample_names_array[i] for i, e in enumerate(sample_names_array) if i not in rows_to_delete] + vcf_indexes_array = [vcf_indexes_array[i] for i, e in enumerate(vcf_indexes_array) if + i not in rows_to_delete] + sample_names_array = [sample_names_array[i] for i, e in enumerate(sample_names_array) if + i not in rows_to_delete] + # create output files with array contents np.savetxt(f"output_{SAMPLE_NAME_FILE_SUFFIX}", sample_names_array, fmt='%s') np.savetxt(f"output_{VCF_FILE_SUFFIX}", vcf_array, fmt='%s') np.savetxt(f"output_{VCF_INDEX_FILE_SUFFIX}", vcf_indexes_array, fmt='%s') -if __name__ == '__main__': - parser = argparse.ArgumentParser(allow_abbrev=False, description=f"Expects that four files exist: input_{SAMPLE_MAP_FILE_SUFFIX}, input_{SAMPLE_NAME_FILE_SUFFIX}, input_{VCF_FILE_SUFFIX}, input_{VCF_INDEX_FILE_SUFFIX}; will create the following files: output_{SAMPLE_MAP_FILE_SUFFIX}, output_{SAMPLE_NAME_FILE_SUFFIX}, output_{VCF_FILE_SUFFIX}, output_{VCF_INDEX_FILE_SUFFIX}") +if __name__ == '__main__': curate_input_arrays() From ac6f2e1af40e652c7b39781c07133bdeb9f49e86 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 20 May 2022 11:20:56 -0400 Subject: [PATCH 11/16] more cleanup and PR feedback --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- .../variantstore/wdl/extract/curate_input_array_files.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 4c1930e6186..ca1cf8432a8 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -383,7 +383,7 @@ task CurateInputLists { python3 /app/curate_input_array_files.py >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220519_5" + docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220520" memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py index 13e27ea1bc1..c8ad939ebc0 100755 --- a/scripts/variantstore/wdl/extract/curate_input_array_files.py +++ b/scripts/variantstore/wdl/extract/curate_input_array_files.py @@ -32,13 +32,13 @@ def curate_input_arrays(): rows_to_delete.append(i) # re-create input arrays using array of "rows" to delete - vcf_array = [vcf_array[i] for i, e in enumerate(vcf_array) if i not in rows_to_delete] - vcf_indexes_array = [vcf_indexes_array[i] for i, e in enumerate(vcf_indexes_array) if + vcf_array = [vcf_array[i] for i in range(len(vcf_array)) if i not in rows_to_delete] + vcf_indexes_array = [vcf_indexes_array[i] for i in range(len(vcf_indexes_array)) if i not in rows_to_delete] - sample_names_array = [sample_names_array[i] for i, e in enumerate(sample_names_array) if + sample_names_array = [sample_names_array[i] for i in range(len(sample_names_array)) if i not in rows_to_delete] - # create output files with array contents + print(f"Creating 'output_{SAMPLE_NAME_FILE_SUFFIX}', 'output_{VCF_FILE_SUFFIX}' and 'output_{VCF_INDEX_FILE_SUFFIX}'.") np.savetxt(f"output_{SAMPLE_NAME_FILE_SUFFIX}", sample_names_array, fmt='%s') np.savetxt(f"output_{VCF_FILE_SUFFIX}", vcf_array, fmt='%s') np.savetxt(f"output_{VCF_INDEX_FILE_SUFFIX}", vcf_indexes_array, fmt='%s') From afa90cdad9a3beeaad27b5eeeb432128d069fc5f Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Wed, 1 Jun 2022 12:35:01 -0400 Subject: [PATCH 12/16] PR feedback --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 6 +++--- .../variantstore/wdl/extract/curate_input_array_files.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index ca1cf8432a8..b9287941e43 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -47,7 +47,7 @@ workflow GvsImportGenomes { input_vcf_index_list = write_lines(input_vcf_indexes), input_vcf_list = write_lines(input_vcfs), input_sample_name_list = write_lines(external_sample_names), - input_sample_map = GetUningestedSampleIds.sample_map, + input_samples_to_be_loaded_map = GetUningestedSampleIds.sample_map, service_account_json_path = service_account_json_path } @@ -361,7 +361,7 @@ task CurateInputLists { String project_id File input_vcf_index_list File input_vcf_list - File input_sample_map + File input_samples_to_be_loaded_map File input_sample_name_list String? service_account_json_path @@ -377,7 +377,7 @@ task CurateInputLists { gsutil cp ~{input_vcf_index_list} input_vcf_index_list_file gsutil cp ~{input_vcf_list} input_vcf_list_file - gsutil cp ~{input_sample_map} input_sample_map_file + gsutil cp ~{input_samples_to_be_loaded_map} input_samples_to_be_loaded_map_file gsutil cp ~{input_sample_name_list} input_sample_name_list_file python3 /app/curate_input_array_files.py diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py index c8ad939ebc0..7298945d992 100755 --- a/scripts/variantstore/wdl/extract/curate_input_array_files.py +++ b/scripts/variantstore/wdl/extract/curate_input_array_files.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import numpy as np from contextlib import contextmanager From 8ae8a6f80b6da7730da0193d4ccaea890c9bfbcb Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Thu, 2 Jun 2022 17:12:07 -0400 Subject: [PATCH 13/16] add test for curate_input_array_test_files --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 8 +-- .../wdl/extract/curate_input_array_files.py | 60 +++++++++++++------ .../input_sample_name_list_file | 10 ++++ .../input_samples_to_be_loaded_map_file | 8 +++ .../input_vcf_index_list_file | 10 ++++ .../input_vcf_list_file | 10 ++++ .../output_sample_name_list_file_correct | 7 +++ .../output_vcf_index_list_file_correct | 7 +++ .../output_vcf_list_file_correct | 7 +++ .../extract/test_curate_input_array_files.py | 26 ++++++++ 10 files changed, 131 insertions(+), 22 deletions(-) create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/input_sample_name_list_file create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/input_samples_to_be_loaded_map_file create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_index_list_file create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_list_file create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/output_sample_name_list_file_correct create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_index_list_file_correct create mode 100644 scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_list_file_correct create mode 100644 scripts/variantstore/wdl/extract/test_curate_input_array_files.py diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index c3542b3c7f4..0a772e1d0e4 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -56,9 +56,9 @@ workflow GvsImportGenomes { call CreateFOFNs { input: batch_size = load_data_batch_size, - input_vcf_index_list = write_lines(input_vcf_indexes), - input_vcf_list = write_lines(input_vcfs), - sample_name_list = write_lines(external_sample_names), + input_vcf_index_list = CurateInputLists.input_vcf_indexes, + input_vcf_list = CurateInputLists.input_vcfs, + sample_name_list = CurateInputLists.sample_name_list, } scatter (i in range(length(CreateFOFNs.vcf_batch_vcf_fofns))) { @@ -385,7 +385,7 @@ task CurateInputLists { python3 /app/curate_input_array_files.py >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220520" + docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220602" memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py index 7298945d992..0669dae3f3a 100755 --- a/scripts/variantstore/wdl/extract/curate_input_array_files.py +++ b/scripts/variantstore/wdl/extract/curate_input_array_files.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- import numpy as np from contextlib import contextmanager +import argparse -SAMPLE_MAP_FILE_SUFFIX = "sample_map_file" +SAMPLE_MAP_TO_BE_LOADED_FILE_SUFFIX = "samples_to_be_loaded_map_file" SAMPLE_NAME_FILE_SUFFIX = "sample_name_list_file" VCF_FILE_SUFFIX = "vcf_list_file" VCF_INDEX_FILE_SUFFIX = "vcf_index_list_file" @@ -15,21 +16,25 @@ def handle_file_error(file_name): print(f"ERROR: required file named '{file_name}' does not exist.") -def curate_input_arrays(): - sample_map_array = vcf_array = vcf_indexes_array = sample_names_array = [] - with handle_file_error(f"input_{SAMPLE_MAP_FILE_SUFFIX}"): - sample_map_array = np.loadtxt(f"input_{SAMPLE_MAP_FILE_SUFFIX}", dtype=str, delimiter=",") - with handle_file_error(f"input_{VCF_FILE_SUFFIX}"): - vcf_array = np.loadtxt(f"input_{VCF_FILE_SUFFIX}", dtype=str) - with handle_file_error(f"input_{VCF_INDEX_FILE_SUFFIX}"): - vcf_indexes_array = np.loadtxt(f"input_{VCF_INDEX_FILE_SUFFIX}", dtype=str) - with handle_file_error(f"input_{SAMPLE_NAME_FILE_SUFFIX}"): - sample_names_array = np.loadtxt(f"input_{SAMPLE_NAME_FILE_SUFFIX}", dtype=str) +def curate_input_arrays(sample_map_to_be_loaded_file_name, + sample_name_list_file_name, + vcf_list_file_name, + vcf_index_list_file_name, + output_files): + sample_map_to_be_loaded_array = vcf_array = vcf_indexes_array = sample_names_array = [] + with handle_file_error(sample_map_to_be_loaded_file_name): + sample_map_to_be_loaded_array = np.loadtxt(sample_map_to_be_loaded_file_name, dtype=str, delimiter=",") + with handle_file_error(vcf_list_file_name): + vcf_array = np.loadtxt(vcf_list_file_name, dtype=str) + with handle_file_error(vcf_index_list_file_name): + vcf_indexes_array = np.loadtxt(vcf_index_list_file_name, dtype=str) + with handle_file_error(sample_name_list_file_name): + sample_names_array = np.loadtxt(sample_name_list_file_name, dtype=str) rows_to_delete = [] # use input_sample_names_array to figure out which index "rows" to delete for i in range(len(sample_names_array)): - if sample_names_array[i] not in sample_map_array: + if sample_names_array[i] not in sample_map_to_be_loaded_array: rows_to_delete.append(i) # re-create input arrays using array of "rows" to delete @@ -39,11 +44,30 @@ def curate_input_arrays(): sample_names_array = [sample_names_array[i] for i in range(len(sample_names_array)) if i not in rows_to_delete] - print(f"Creating 'output_{SAMPLE_NAME_FILE_SUFFIX}', 'output_{VCF_FILE_SUFFIX}' and 'output_{VCF_INDEX_FILE_SUFFIX}'.") - np.savetxt(f"output_{SAMPLE_NAME_FILE_SUFFIX}", sample_names_array, fmt='%s') - np.savetxt(f"output_{VCF_FILE_SUFFIX}", vcf_array, fmt='%s') - np.savetxt(f"output_{VCF_INDEX_FILE_SUFFIX}", vcf_indexes_array, fmt='%s') - + if output_files: + print(f"Creating 'output_{SAMPLE_NAME_FILE_SUFFIX}', 'output_{VCF_FILE_SUFFIX}' and 'output_{VCF_INDEX_FILE_SUFFIX}'.") + np.savetxt(f"output_{SAMPLE_NAME_FILE_SUFFIX}", sample_names_array, fmt='%s') + np.savetxt(f"output_{VCF_FILE_SUFFIX}", vcf_array, fmt='%s') + np.savetxt(f"output_{VCF_INDEX_FILE_SUFFIX}", vcf_indexes_array, fmt='%s') + else: + d = dict(); + d['sample_names_array'] = sample_names_array + d['vcf_array'] = vcf_array + d['vcf_indexes_array'] = vcf_indexes_array + return d if __name__ == '__main__': - curate_input_arrays() + parser = argparse.ArgumentParser(allow_abbrev=False, description='Curate GvsImportGenomes arrays to remove duplicate samples') + + parser.add_argument('--sample_map_to_be_loaded_file_name',type=str, help='name of sample_map file', required=False, default=f"input_{SAMPLE_MAP_TO_BE_LOADED_FILE_SUFFIX}") + parser.add_argument('--sample_name_list_file_name',type=str, help='name of sample name list file', required=False, default=f"input_{SAMPLE_NAME_FILE_SUFFIX}") + parser.add_argument('--vcf_list_file_name',type=str, help='name of VCF list file', required=False, default=f"input_{VCF_FILE_SUFFIX}") + parser.add_argument('--vcf_index_list_file_name',type=str, help='name of VCF index list file', required=False, default=f"input_{VCF_INDEX_FILE_SUFFIX}") + parser.add_argument('--output_files',type=bool, help='true (default): outputs are files; false: outputs are arrays', required=False, default=True) + args = parser.parse_args() + + curate_input_arrays(args.sample_map_to_be_loaded_file_name, + args.sample_name_list_file_name, + args.vcf_list_file_name, + args.vcf_index_list_file_name, + args.output_files) diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_sample_name_list_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_sample_name_list_file new file mode 100644 index 00000000000..5fa7b33ae98 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_sample_name_list_file @@ -0,0 +1,10 @@ +ERS4367795 +ERS4367796 +ERS4367797 +ERS4367798 +ERS4367799 +ERS4367800 +ERS4367801 +ERS4367803 +ERS4367804 +ERS4367805 diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_samples_to_be_loaded_map_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_samples_to_be_loaded_map_file new file mode 100644 index 00000000000..558a078cae3 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_samples_to_be_loaded_map_file @@ -0,0 +1,8 @@ +sample_id,sample_name +9,ERS4367804 +7,ERS4367801 +4,ERS4367798 +6,ERS4367800 +10,ERS4367805 +2,ERS4367796 +1,ERS4367795 diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_index_list_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_index_list_file new file mode 100644 index 00000000000..e89e15c5ae6 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_index_list_file @@ -0,0 +1,10 @@ +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/22604e48-c97b-4709-bb0c-aeeef2891177/call-Reblock/HG00418.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/ee0fbdc6-4d59-4733-a40b-e3fd51b8daea/call-Reblock/HG00423.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/26523d6c-5bae-4486-915d-f4ee3a969420/call-Reblock/HG00444.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_list_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_list_file new file mode 100644 index 00000000000..cfc9df59c94 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_list_file @@ -0,0 +1,10 @@ +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/22604e48-c97b-4709-bb0c-aeeef2891177/call-Reblock/HG00418.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/ee0fbdc6-4d59-4733-a40b-e3fd51b8daea/call-Reblock/HG00423.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/26523d6c-5bae-4486-915d-f4ee3a969420/call-Reblock/HG00444.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_sample_name_list_file_correct b/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_sample_name_list_file_correct new file mode 100644 index 00000000000..c4897ec29ff --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_sample_name_list_file_correct @@ -0,0 +1,7 @@ +ERS4367795 +ERS4367796 +ERS4367798 +ERS4367800 +ERS4367801 +ERS4367804 +ERS4367805 diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_index_list_file_correct b/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_index_list_file_correct new file mode 100644 index 00000000000..7d71be1da11 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_index_list_file_correct @@ -0,0 +1,7 @@ +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_list_file_correct b/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_list_file_correct new file mode 100644 index 00000000000..d8fbd889a87 --- /dev/null +++ b/scripts/variantstore/wdl/extract/curate_input_array_test_files/output_vcf_list_file_correct @@ -0,0 +1,7 @@ +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz +gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz diff --git a/scripts/variantstore/wdl/extract/test_curate_input_array_files.py b/scripts/variantstore/wdl/extract/test_curate_input_array_files.py new file mode 100644 index 00000000000..0e94163f638 --- /dev/null +++ b/scripts/variantstore/wdl/extract/test_curate_input_array_files.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +import numpy as np +import unittest + +from curate_input_array_files import curate_input_arrays + +dir='curate_input_array_test_files/' +with open(dir + "output_sample_name_list_file_correct"): + output_sample_name_list_correct = np.loadtxt(dir + "output_sample_name_list_file_correct", dtype=str).tolist() +with open(dir + "output_vcf_list_file_correct"): + output_vcf_list_correct = np.loadtxt(dir + "output_vcf_list_file_correct", dtype=str).tolist() +with open(dir + "output_vcf_index_list_file_correct"): + output_vcf_index_list_correct = np.loadtxt(dir + "output_vcf_index_list_file_correct", dtype=str).tolist() + +class TestCurateInputArrays(unittest.TestCase): + def test_curate_input_array_files_success(self): + actual = curate_input_arrays( + sample_map_to_be_loaded_file_name=dir + 'input_samples_to_be_loaded_map_file', + sample_name_list_file_name=dir + 'input_sample_name_list_file', + vcf_list_file_name=dir + 'input_vcf_list_file', + vcf_index_list_file_name=dir + 'input_vcf_index_list_file', + output_files='') + self.maxDiff=None + self.assertEqual(actual['sample_names_array'], output_sample_name_list_correct) + self.assertEqual(actual['vcf_array'], output_vcf_list_correct) + self.assertEqual(actual['vcf_indexes_array'], output_vcf_index_list_correct) From 40b71adb359ee3f22904abbb3f71536075f49843 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 3 Jun 2022 11:24:04 -0400 Subject: [PATCH 14/16] PR feedback --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index b92b047cd9f..1c1b0dc171d 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -3,6 +3,7 @@ version 1.0 import "GvsUtils.wdl" as GvsUtils workflow GvsImportGenomes { + input { Boolean go = true String dataset_name @@ -382,10 +383,10 @@ task CurateInputLists { gcloud auth activate-service-account --key-file=local.service_account.json fi - gsutil cp ~{input_vcf_index_list} input_vcf_index_list_file - gsutil cp ~{input_vcf_list} input_vcf_list_file - gsutil cp ~{input_samples_to_be_loaded_map} input_samples_to_be_loaded_map_file - gsutil cp ~{input_sample_name_list} input_sample_name_list_file +# gsutil cp ~{input_vcf_index_list} input_vcf_index_list_file +# gsutil cp ~{input_vcf_list} input_vcf_list_file +# gsutil cp ~{input_samples_to_be_loaded_map} input_samples_to_be_loaded_map_file +# gsutil cp ~{input_sample_name_list} input_sample_name_list_file python3 /app/curate_input_array_files.py >>> @@ -394,7 +395,7 @@ task CurateInputLists { memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 - preemptible: 0 + preemptible: 3 cpu: 1 } From f565850b6550b5073518025f8ba378fea009e1d1 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 3 Jun 2022 12:32:26 -0400 Subject: [PATCH 15/16] use localized inputs --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index 1c1b0dc171d..bc99bf70823 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -383,12 +383,11 @@ task CurateInputLists { gcloud auth activate-service-account --key-file=local.service_account.json fi -# gsutil cp ~{input_vcf_index_list} input_vcf_index_list_file -# gsutil cp ~{input_vcf_list} input_vcf_list_file -# gsutil cp ~{input_samples_to_be_loaded_map} input_samples_to_be_loaded_map_file -# gsutil cp ~{input_sample_name_list} input_sample_name_list_file - - python3 /app/curate_input_array_files.py + python3 /app/curate_input_array_files.py --sample_map_to_be_loaded_file_name ~{input_samples_to_be_loaded_map} \ + --sample_name_list_file_name ~{input_sample_name_list} \ + --vcf_list_file_name ~{input_vcf_list} \ + --vcf_index_list_file_name ~{input_sample_name_list} \ + --output_files True >>> runtime { docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220602" From c8a6046ec45df51a6a27d851a66d5ca3622fe9c2 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 3 Jun 2022 13:36:58 -0400 Subject: [PATCH 16/16] one more typo --- scripts/variantstore/wdl/GvsImportGenomes.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index bc99bf70823..e4e77aeba9e 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -386,7 +386,7 @@ task CurateInputLists { python3 /app/curate_input_array_files.py --sample_map_to_be_loaded_file_name ~{input_samples_to_be_loaded_map} \ --sample_name_list_file_name ~{input_sample_name_list} \ --vcf_list_file_name ~{input_vcf_list} \ - --vcf_index_list_file_name ~{input_sample_name_list} \ + --vcf_index_list_file_name ~{input_vcf_index_list} \ --output_files True >>> runtime {