Change Batch Size Parameter to Scatter Width For Ingest [VS-1218] #8985

Merged 2 commits on Sep 24, 2024
2 changes: 1 addition & 1 deletion .dockstore.yml
@@ -184,7 +184,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - vs_1456_status_writes_bug
+        - rsa_vs_1218
       tags:
         - /.*/
   - name: GvsPrepareRangesCallset
2 changes: 1 addition & 1 deletion scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
@@ -6,7 +6,7 @@
 - As described in the "Getting Started" of [Operational concerns for running Hail in Terra Cromwell/WDL](https://docs.google.com/document/d/1_OY2rKwZ-qKCDldSZrte4jRIZf4eAw2d7Jd-Asi50KE/edit?usp=sharing), this workspace will need permission in Terra to run Hail dataproc clusters within WDL. Contact Emily to request this access as part of setting up the new workspace.
 - There is a quota that needs to be upgraded for the process of Bulk Ingest.
   When we ingest data, we use the Write API, which is part of BQ’s Storage API. Since we are hitting this API with so much data all at once, we want to increase our CreateWriteStream quota. Follow the [Quota Request Template](workspace/CreateWriteStreamRequestIncreasedQuota.md).
-  Once that quota has been increased, the `load_data_batch_size` value needs to be updated based on calculations in the [Quota Request Template](workspace/CreateWriteStreamRequestIncreasedQuota.md) doc. Even if no increased quota is granted, this doc goes over how to choose the value for this param.
+  Once that quota has been increased, the `load_data_scatter_width` value needs to be updated based on that new quota (for information on what we did for Echo, see the "Calculate Quota To be Requested" section in the [Quota Request Template](workspace/CreateWriteStreamRequestIncreasedQuota.md) doc).
 - Create and push a feature branch (e.g. `EchoCallset`) based off the `ah_var_store` branch to the GATK GitHub repo.
 - Update the .dockstore.yml file on that feature branch to add the feature branch for all the WDLs that will be loaded into the workspace in the next step.
 - Once the requested workspace has been created and permissioned, populate with the following WDLs:
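Note on the quota arithmetic the doc change above refers to: the scatter width is bounded by how many write streams the project may open concurrently. A minimal sketch of that calculation, assuming (hypothetically) that each LoadData shard opens a single write stream; the quota number below is made up, and the real value comes from the Quota Request Template doc:

version 1.0

# Illustrative only: cap the scatter width by a (hypothetical) CreateWriteStream quota.
workflow ScatterWidthFromQuotaSketch {
    input {
        Int create_write_stream_quota = 1000  # hypothetical granted quota
        Int streams_per_shard = 1             # assumption: one write stream per LoadData shard
    }

    # Keep the number of concurrent shards within the quota.
    Int load_data_scatter_width = create_write_stream_quota / streams_per_shard

    output {
        Int suggested_scatter_width = load_data_scatter_width
    }
}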
10 changes: 2 additions & 8 deletions scripts/variantstore/wdl/GvsBulkIngestGenomes.wdl
@@ -39,9 +39,7 @@ workflow GvsBulkIngestGenomes {
         # set to "NONE" to ingest all the reference data into GVS for VDS (instead of VCF) output
         String drop_state = "NONE"
 
-        # The larger the `load_data_batch_size` the greater the probability of preemptions and non-retryable BigQuery errors,
-        # so if specifying `load_data_batch_size`, adjust preemptible and maxretries accordingly. Or just take the defaults, as those should work fine in most cases.
-        Int? load_data_batch_size
+        Int? load_data_scatter_width
         Int? load_data_preemptible_override
         Int? load_data_maxretries_override
         String? billing_project_id
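The renamed input stays optional: callers that omit it still get the auto batch-sizing path in GvsImportGenomes. A minimal, self-contained sketch of the forwarding pattern, using hypothetical workflow/task names (only `load_data_scatter_width` comes from the diff):

version 1.0

workflow ForwardingSketch {
    input {
        Int? load_data_scatter_width  # optional: omit to let the callee pick a default
    }

    call EchoWidth {
        input:
            load_data_scatter_width = load_data_scatter_width
    }

    output {
        Int used_width = EchoWidth.used_width
    }
}

task EchoWidth {
    input {
        Int? load_data_scatter_width
    }

    # Fall back to a default width when the caller did not set one.
    Int effective_width = select_first([load_data_scatter_width, 200])

    command <<<
        echo ~{effective_width}
    >>>

    output {
        Int used_width = effective_width
    }
}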
@@ -131,11 +129,7 @@ workflow GvsBulkIngestGenomes {
             input_vcfs = SplitBulkImportFofn.vcf_file_name_fofn,
             input_vcf_indexes = SplitBulkImportFofn.vcf_index_file_name_fofn,
             interval_list = interval_list,
-
-            # The larger the `load_data_batch_size` the greater the probability of preemptions and non-retryable
-            # BigQuery errors so if specifying this adjust preemptible and maxretries accordingly. Or just take the defaults,
-            # those should work fine in most cases.
-            load_data_batch_size = load_data_batch_size,
+            load_data_scatter_width = load_data_scatter_width,
             load_data_maxretries_override = load_data_maxretries_override,
             load_data_preemptible_override = load_data_preemptible_override,
             basic_docker = effective_basic_docker,
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -29,7 +29,7 @@ workflow GvsImportGenomes {
         # without going over
         Int beta_customer_max_scatter = 200
         File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
-        Int? load_data_batch_size
+        Int? load_data_scatter_width
         Int? load_data_preemptible_override
         Int? load_data_maxretries_override
         # At least one of these "load" inputs must be true
@@ -76,17 +76,17 @@ workflow GvsImportGenomes {
         }
     }
 
-    if ((num_samples > max_auto_batch_size) && !(defined(load_data_batch_size))) {
+    if ((num_samples > max_auto_batch_size) && !(defined(load_data_scatter_width))) {
         call Utils.TerminateWorkflow as DieDueToTooManySamplesWithoutExplicitLoadDataBatchSize {
             input:
-                message = "Importing " + num_samples + " samples but 'load_data_batch_size' is not explicitly specified; the limit for auto batch-sizing is " + max_auto_batch_size + " for " + genome_type + " samples.",
+                message = "Importing " + num_samples + " samples but 'load_data_scatter_width' is not explicitly specified; the limit for auto batch-sizing is " + max_auto_batch_size + " for " + genome_type + " samples.",
                 basic_docker = effective_basic_docker,
         }
     }
 
     # At least 1, per limits above not more than 20.
     # But if it's a beta customer, use the number computed above
-    Int effective_load_data_batch_size = if (defined(load_data_batch_size)) then select_first([load_data_batch_size])
+    Int effective_load_data_batch_size = if (defined(load_data_scatter_width)) then select_first([num_samples / load_data_scatter_width])
     else if num_samples < max_scatter_for_user then 1
     else if is_wgs then num_samples / max_scatter_for_user
     else if num_samples < 5001 then (num_samples / (max_scatter_for_user * 2))
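Note on the semantics of the last hunk: the old `load_data_batch_size` fixed the samples per shard directly, while the new `load_data_scatter_width` fixes the number of shards, and the batch size is derived from it as `num_samples / load_data_scatter_width`. A minimal sketch of that arithmetic with made-up numbers (the real workflow adds the WGS/exome fallback chain shown above):

version 1.0

# Illustrative arithmetic only.
workflow EffectiveBatchSizeSketch {
    input {
        Int num_samples = 10000
        Int load_data_scatter_width = 200
    }

    # Scatter width = number of parallel LoadData shards;
    # batch size = samples handled by each shard.
    Int effective_load_data_batch_size = num_samples / load_data_scatter_width  # 10000 / 200 = 50

    output {
        Int batch_size = effective_load_data_batch_size
    }
}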