
Update Quickstart & Integration to use re-blocked v2 gVCFs [VS-491] #7924

Merged: 5 commits merged on Jun 29, 2022
2 changes: 1 addition & 1 deletion .dockstore.yml
@@ -210,7 +210,7 @@ workflows:
branches:
- master
- ah_var_store
- vs_447_fixup_non_fq_invocations
- rsa_reblock_quickstart_v2
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
18 changes: 9 additions & 9 deletions scripts/variantstore/InputValidation.ipynb
@@ -51,7 +51,7 @@
"source": [
"def get_field_name(possible_field_names, attribute_names, type_string):\n",
" error_seen = False\n",
" \n",
"\n",
" field_names_found = set()\n",
" for field_name in possible_field_names:\n",
" if (field_name in attribute_names):\n",
@@ -126,13 +126,13 @@
"errors_seen = False\n",
"\n",
"# This is a list of all of the *possible* field names for reblocked gvcfs and their corresponding indices\n",
"reblocked_gvcf_fields = ['reblocked_gvcf', \n",
"reblocked_gvcf_fields = ['reblocked_gvcf',\n",
" 'reblocked_gvcf_path',\n",
" 'hg38_reblocked_gvcf']\n",
" 'hg38_reblocked_v2_vcf']\n",
"reblocked_gvcf_index_fields = [\n",
" 'reblocked_gvcf_index',\n",
" 'reblocked_gvcf_index_path',\n",
" 'hg38_reblocked_gvcf_index']\n",
" 'hg38_reblocked_v2_vcf_index']\n",
"\n",
"\n",
"entity_types = fapi.list_entity_types(ws_project, ws_name).json()\n",
@@ -146,19 +146,19 @@
" errors_seen = True\n",
" error_message = sample_set[\"message\"]\n",
" print(f\"ERROR: Looking up {sample_set_id}: {error_message}\")\n",
" \n",
"\n",
"if (not errors_seen):\n",
" samples_in_sample_set = set()\n",
" samples_dupes = set()\n",
" \n",
"\n",
" attributes = sample_set[\"attributes\"]\n",
" for entity in sample_set['attributes']['samples']['items']:\n",
" sample_id = entity['entityName']\n",
"\n",
" if sample_id in samples_in_sample_set:\n",
" samples_dupes.add(sample_id)\n",
" else:\n",
" samples_in_sample_set.add(sample_id) \n",
" samples_in_sample_set.add(sample_id)\n",
"\n",
" # Are there any empty sample_ids?\n",
" if ('' in samples_in_sample_set):\n",
@@ -191,7 +191,7 @@
" gvcf_index_field, error_seen = get_field_name(reblocked_gvcf_index_fields, attribute_names, \"reblocked gvcf index\")\n",
" if (error_seen):\n",
" errors_seen = True\n",
" \n",
"\n",
"if (not errors_seen):\n",
" entity_count = entity_types[etype][\"count\"]\n",
"\n",
@@ -226,7 +226,7 @@
" else:\n",
" error_seen_for_sample = True\n",
" print(f\"ERROR: Unrecognized extension \\\"{reblocked_gvcf_name_and_ext[1]}\\\" for {gvcf_field}: {reblocked_gvcf_name}\")\n",
" \n",
"\n",
" if (not error_seen_for_sample):\n",
" reblocked_gvcf_index_name = os.path.basename(reblocked_gvcf_index)\n",
" if (reblocked_gvcf_index_name != expected_reblocked_gvcf_index_name):\n",
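The notebook diff above renames the accepted column names (`hg38_reblocked_gvcf` becomes `hg38_reblocked_v2_vcf`) while keeping the same resolution logic: given several possible column names, find the one the Terra sample table actually uses. A simplified sketch of that idea, with illustrative attribute names; the notebook's real helper also prints which candidates matched:

```python
# Simplified sketch of the notebook's get_field_name helper: given the
# possible column names for a re-blocked gVCF, report which one the table
# actually uses, and flag an error unless there is exactly one.
REBLOCKED_GVCF_FIELDS = ["reblocked_gvcf",
                         "reblocked_gvcf_path",
                         "hg38_reblocked_v2_vcf"]

def get_field_name(possible_field_names, attribute_names, type_string):
    found = {f for f in possible_field_names if f in attribute_names}
    if len(found) == 1:
        return found.pop(), False   # exactly one candidate present: use it
    print(f"ERROR: expected exactly one {type_string} column, found {sorted(found)}")
    return None, True               # zero or multiple candidates: error

field, error_seen = get_field_name(
    REBLOCKED_GVCF_FIELDS, ["sample_id", "hg38_reblocked_v2_vcf"], "reblocked gvcf")
# field == "hg38_reblocked_v2_vcf", error_seen == False
```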
14 changes: 6 additions & 8 deletions scripts/variantstore/TERRA_QUICKSTART.md
@@ -10,15 +10,14 @@ Through this QuickStart you will learn how to use the Broad Genomic Variant Store
This quickstart assumes that you are familiar with Terra workspaces, the data model, and how to provide input parameters and launch workflows.

1. You will need to have or create a BigQuery dataset (we'll call this `dataset_name` later on).
2. Grant the "BigQuery Data Editor" role on that **dataset** to your Terra PROXY group. Your proxy group name can be found on your Terra Profile page and looks something like `PROXY_12345678901234567890@firecloud.org`.
Contributor comment (conversation resolved by rsasch): did we used to need this? I'm so curious about where this came from!
3. Grant the following roles on the Google **project** (we'll call this `project_id` later on) containing the dataset to your proxy group:
2. Grant the following roles on the Google **project** (we'll call this `project_id` later on) containing the dataset to your proxy group:
- BigQuery data editor
- BigQuery job user
- BigQuery Read Session User
4. These tools expect re-blocked gVCF files as input, which are provided in this workspace
3. These tools expect re-blocked gVCF files as input, which are provided in this workspace

## 1. Import Data
A sample set for the quickstart has already been created with 10 samples and paths to re-blocked gVCFs for each sample. Run the two import workflows against this sample set by selecting "sample_set" as the root entity type ("Step 1" on the workflow submission page) and `gvs_demo-10` for the data ("Step 2" on the workflow submission page). If you are creating your own sample set, note that the sample table should have columns for the re-blocked gVCFs (`hg38_reblocked_gvcf` or `reblocked_gvcf_path`) and their index files.
A sample set for the quickstart has already been created with 10 samples and paths to re-blocked gVCFs for each sample. Run the two import workflows against this sample set by selecting "sample_set" as the root entity type ("Step 1" on the workflow submission page) and `gvs_demo-10` for the data ("Step 2" on the workflow submission page). If you are creating your own sample set, note that the sample table should have columns for the re-blocked gVCFs (`hg38_reblocked_v2_vcf` or `reblocked_gvcf_path`) and their index files.
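As the updated WDL inputs later in this PR show, each re-blocked gVCF's index is the gVCF path with a `.tbi` suffix. A small sketch of that pairing check, mirroring the validation in `InputValidation.ipynb`; the bucket and sample name come from the quickstart inputs and are used here only for illustration:

```python
import os

# Check that a gVCF index sits alongside its gVCF with a ".tbi" suffix,
# comparing basenames so the bucket prefix does not matter.
def index_matches(gvcf_path: str, index_path: str) -> bool:
    return os.path.basename(index_path) == os.path.basename(gvcf_path) + ".tbi"

gvcf = ("gs://gvs-internal-quickstart/reblocked-v2-vcfs/"
        "HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz")
print(index_matches(gvcf, gvcf + ".tbi"))  # True
```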

## 1.1 Assign Gvs IDs and Create Loading Tables
To optimize the internal queries, each sample must have a unique and consecutive integer ID assigned. Run the `GvsAssignIds` workflow, which will create an appropriate ID for each sample in the sample set and update the BigQuery dataset with the sample name to ID mapping info.
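A toy sketch of the ID-assignment idea described above, assuming IDs simply start at 1; the real `GvsAssignIds` workflow performs this in BigQuery and persists the sample-name-to-ID mapping there:

```python
# Assign each sample a unique, consecutive integer ID, starting from
# next_id. Sample names here are drawn from the quickstart set.
def assign_ids(sample_names, next_id=1):
    return {name: next_id + i for i, name in enumerate(sample_names)}

mapping = assign_ids(["ERS4367795", "HG00405", "HG00408"])
# mapping == {"ERS4367795": 1, "HG00405": 2, "HG00408": 3}
```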
@@ -44,8 +43,8 @@ These are the required parameters which must be supplied to the workflow:
| --------------------- | ----------- |
| dataset_name | the name of the dataset you created above |
| external_sample_names | `this.samples.sample_id` (the sample identifier from the `gvs_demo_10` sample set) |
| input_vcf_indexes | `this.samples.hg38_reblocked_gvcf_index` (reblocked gvcf index file for each sample) |
| input_vcfs | `this.samples.hg38_reblocked_gvcf` (reblocked gvcf file for each sample) |
| input_vcf_indexes | `this.samples.hg38_reblocked_v2_vcf_index` (reblocked gvcf index file for each sample) |
| input_vcfs | `this.samples.hg38_reblocked_v2_vcf` (reblocked gvcf file for each sample) |
| project_id | the name of the google project containing the dataset |
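The parameter table above can be sketched as a Terra inputs dictionary. The dataset and project values below are placeholders, and in practice Terra resolves the `this.samples.*` expressions from the selected sample set rather than receiving them as literal strings:

```python
import json

# Illustrative workflow inputs assembled from the parameter table.
inputs = {
    "dataset_name": "my_gvs_dataset",        # placeholder: your dataset
    "external_sample_names": "this.samples.sample_id",
    "input_vcfs": "this.samples.hg38_reblocked_v2_vcf",
    "input_vcf_indexes": "this.samples.hg38_reblocked_v2_vcf_index",
    "project_id": "my-google-project",       # placeholder: your project
}
print(json.dumps(inputs, indent=2))
```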

## 2. Create Alt Allele Table
@@ -57,6 +56,7 @@ This is done by running the `GvsCreateAltAllele` workflow with the following parameters:

| Parameter | Description |
| ----------------- | ----------- |
| call_set_identifier | a unique name to identify this callset (e.g. `my_gvs_demo`); you will want to make note of this for later steps |
| dataset_name | the name of the dataset you created above |
| project_id | the name of the google project containing the dataset |

@@ -71,9 +71,7 @@ This is done by running the `GvsCreateFilterSet` workflow with the following parameters:
| --------------------------------- | ----------- |
| dataset_name | the name of the dataset you created above |
| filter_set_name | a unique name to identify this filter set (e.g. `my_demo_filters`); you will want to make note of this for use in step 5 |
| INDEL_VQSR_max_gaussians_override | you don't need to set this unless a previous run of IndelsVariantRecalibrator task failed to converge, start with 3 and lower as needed |
| project_id | the name of the google project containing the dataset |
| SNP_VQSR_max_gaussians_override | you don't need to set this unless a previous run of SNPsVariantRecalibratorClassic task failed to converge, start with 5 and lower as needed |

## 4. Prepare Callset
This step performs the heavy lifting in BigQuery to gather all the data required to create a jointly called VCF.
42 changes: 21 additions & 21 deletions scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
@@ -7,7 +7,7 @@ workflow GvsQuickstartIntegration {

input {
String branch_name
String expected_output_prefix = "gs://broad-dsp-spec-ops/quickstart_integration/2022-06-03/"
String expected_output_prefix = "gs://gvs-internal-quickstart/integration/2022-06-28/"

Array[String] external_sample_names = [
"ERS4367795",
@@ -23,29 +23,29 @@
]

Array[File] input_vcfs = [
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00405.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00408.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00418.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00420.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00423.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00427.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00429.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00444.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00447.haplotypeCalls.er.raw.vcf.gz.vcf.gz",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00450.haplotypeCalls.er.raw.vcf.gz.vcf.gz"
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
Contributor comment (@RoriCremer, Jun 29, 2022): nice!

Contributor comment: should I be worried that the "example" output VCFs in the Beta workspace now might look slightly different? Or actually, this reblocking change in theory should not alter the outcome....just the cost and efficiency of the pipeline, right?
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00418.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00423.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00444.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz"
]

Array[File] input_vcf_indexes = [
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00405.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00408.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00418.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00420.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00423.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00427.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00429.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00444.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00447.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://fc-2b4456d7-974b-4b67-90f8-63c2fd2c03d4/gvcfs/HG00450.haplotypeCalls.er.raw.vcf.gz.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00418.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00423.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00444.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi",
"gs://gvs-internal-quickstart/reblocked-v2-vcfs/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi"
]
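Since each index path in the array above is just the corresponding gVCF path plus `.tbi`, the second list can be derived from the first. A sketch with a subset of the samples, which would keep the two lists in sync if paths change:

```python
# Derive the index list from the VCF list instead of maintaining both.
samples = ["HG00405", "HG00408", "HG00418"]  # subset, for illustration
prefix = "gs://gvs-internal-quickstart/reblocked-v2-vcfs/"
input_vcfs = [f"{prefix}{s}.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz"
              for s in samples]
input_vcf_indexes = [v + ".tbi" for v in input_vcfs]
```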

Int? extract_scatter_count