Adding AD for scale testing VS 225 add AD (#7713)
* add call_AD to prepare py

* add callAD to the extract record

* add callAD to engine

* update schema for PLs and AD

* add callAD to extract

* still need to update the Avro file

* update dockstore

* add updated jar

* add emit AD to wdl

* pass wdl vars

* add docker image

* add AD to rescattering workflow in dockstore

* update naming

* update wdl with newest jar
RoriCremer authored Apr 12, 2022
1 parent f49e122 commit fd960a4
Showing 9 changed files with 60 additions and 18 deletions.
4 changes: 3 additions & 1 deletion .dockstore.yml

@@ -79,6 +79,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - rc-vs-225-add-AD
   - name: GvsCreateFilterSet
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -116,7 +117,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - kc_wdl_gatk_override
+        - rc-vs-225-add-AD
   - name: GvsImportGenomes
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -145,6 +146,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - rc-vs-225-add-AD
   - name: GvsCreateVAT
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateVAT.wdl
12 changes: 11 additions & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl

@@ -15,7 +15,7 @@ workflow GvsExtractCallset {

     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
-    File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220404/gatk-package-4.2.0.0-485-g86fd5ac-SNAPSHOT-local.jar"
+    File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/rc-add-AD-04112022/gatk-package-4.2.0.0-498-g1f53709-SNAPSHOT-local.jar"

     String output_file_base_name = filter_set_name

@@ -42,6 +42,9 @@ workflow GvsExtractCallset {
     String fq_ranges_dataset = "~{project_id}.~{dataset_name}"
     Array[String] tables_patterns_for_datetime_check = ["~{full_extract_prefix}__%"]

+    Boolean emit_pls = false
+    Boolean emit_ads = true
+
     call Utils.SplitIntervals {
         input:
             intervals = interval_list,
@@ -102,6 +105,8 @@
             max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
             extract_preemptible_override = extract_preemptible_override,
             extract_maxretries_override = extract_maxretries_override,
+            emit_pls = emit_pls,
+            emit_ads = emit_ads,
        }
    }

@@ -200,6 +205,9 @@ task ExtractTask {
        String output_file
        String? output_gcs_dir

+       Boolean emit_pls
+       Boolean emit_ads
+
        Boolean do_not_filter_override
        String fq_ranges_dataset
        String fq_filter_set_info_table
@@ -256,6 +264,8 @@
            ~{"--inferred-reference-state " + drop_state} \
            -L ~{intervals} \
            --project-id ~{read_project_id} \
+           ~{true='--emit-pls' false='' emit_pls} \
+           ~{true='--emit-ads' false='' emit_ads} \
            ${FILTERING_ARGS}

        # Drop trailing slash if one exists
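The last hunk is how the new Booleans reach GATK: WDL's ~{true='--emit-ads' false='' emit_ads} interpolation renders the flag when the value is true and renders nothing when it is false, so an unset option vanishes from the command line rather than being passed as --emit-ads=false. As a rough illustration only (plain Java, not GVS code), the equivalent conditional argument assembly:

import java.util.ArrayList;
import java.util.List;

public class EmitFlagArgs {
    // Mirrors ~{true='--emit-pls' false='' emit_pls}: add the flag, or add nothing at all.
    static List<String> flagsFor(boolean emitPls, boolean emitAds) {
        List<String> args = new ArrayList<>();
        if (emitPls) args.add("--emit-pls");
        if (emitAds) args.add("--emit-ads");
        return args;
    }

    public static void main(String[] unused) {
        System.out.println(flagsFor(false, true)); // [--emit-ads]
    }
}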
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl

@@ -111,7 +111,7 @@ task PrepareRangesCallsetTask {
    }

    runtime {
-       docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_add_sample_columns_2022_03_28"
+       docker: "us.gcr.io/broad-dsde-methods/variantstore:rc_add_AD_2022_03_30"
        memory: "3 GB"
        disks: "local-disk 100 HDD"
        bootDiskSizeGb: 15
Python callset-prepare script:

@@ -26,7 +26,7 @@
 output_table_prefix = str(uuid.uuid4()).split("-")[0]
 print(f"running with prefix {output_table_prefix}")

-PET_VET_TABLE_COUNT = -1
+REF_VET_TABLE_COUNT = -1
 client = None

 EXTRACT_SAMPLE_TABLE = f"{output_table_prefix}_sample_names"
@@ -45,7 +45,7 @@ def dump_job_stats():
     print(" Total GBs billed ", total/(1024 * 1024 * 1024), " GBs")

 def get_partition_range(i):
-    if i < 1 or i > PET_VET_TABLE_COUNT:
+    if i < 1 or i > REF_VET_TABLE_COUNT:
         raise ValueError(f"out of partition range")

     return { 'start': (i-1)*SAMPLES_PER_PARTITION + 1, 'end': i*SAMPLES_PER_PARTITION }
@@ -115,6 +115,7 @@ def create_final_extract_vet_table(fq_destination_table_vet_data):
        alt STRING,
        call_GT STRING,
        call_GQ INT64,
+       call_AD STRING,
        AS_QUALapprox STRING,
        QUALapprox STRING,
        call_PL STRING
@@ -151,7 +152,7 @@ def get_ref_subselect(fq_ref_table, samples, id):
            f" `{fq_ref_table}` WHERE sample_id IN ({sample_stanza})), "
        return sql

-    for i in range(1, PET_VET_TABLE_COUNT+1):
+    for i in range(1, REF_VET_TABLE_COUNT+1):
        partition_samples = get_samples_for_partition(sample_ids, i)  # sample ids for the partition

        if len(partition_samples) > 0:
@@ -180,16 +181,16 @@ def get_ref_subselect(fq_ref_table, samples, id):
 def populate_final_extract_table_with_vet(fq_ranges_dataset, fq_destination_table_data, sample_ids):
     def get_ref_subselect(fq_vet_table, samples, id):
         sample_stanza = ','.join([str(s) for s in samples])
-        sql = f" q_{id} AS (SELECT location, sample_id, ref, alt, call_GT, call_GQ, AS_QUALapprox, QUALapprox, CALL_PL FROM \n" \
+        sql = f" q_{id} AS (SELECT location, sample_id, ref, alt, call_GT, call_GQ, call_AD, AS_QUALapprox, QUALapprox, CALL_PL FROM \n" \
              f" `{fq_vet_table}` WHERE sample_id IN ({sample_stanza})), "
         return sql

-    for i in range(1, PET_VET_TABLE_COUNT+1):
+    for i in range(1, REF_VET_TABLE_COUNT+1):
         partition_samples = get_samples_for_partition(sample_ids, i)  # sample ids for the partition

         if len(partition_samples) > 0:
             subs = {}
-            insert = f"\nINSERT INTO `{fq_destination_table_data}` (location, sample_id, ref, alt, call_GT, call_GQ, AS_QUALapprox, QUALapprox, CALL_PL) \n WITH \n"
+            insert = f"\nINSERT INTO `{fq_destination_table_data}` (location, sample_id, ref, alt, call_GT, call_GQ, call_AD, AS_QUALapprox, QUALapprox, CALL_PL) \n WITH \n"
             fq_vet_table = f"{fq_ranges_dataset}.{VET_TABLE_PREFIX}{i:03}"
             j = 1
@@ -264,16 +265,16 @@ def make_extract_table(control_samples,

     ## TODO -- provide a cmdline arg to override this (so we can simulate smaller datasets)

-    global PET_VET_TABLE_COUNT
-    PET_VET_TABLE_COUNT = max_tables
+    global REF_VET_TABLE_COUNT  ## TODO why are we using PET here?
+    REF_VET_TABLE_COUNT = max_tables

     global TEMP_TABLE_TTL_HOURS
     TEMP_TABLE_TTL_HOURS = temp_table_ttl_hours

     global TEMP_TABLE_TTL
     TEMP_TABLE_TTL = f" OPTIONS( expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {TEMP_TABLE_TTL_HOURS} HOUR)) "

-    print(f"Using {PET_VET_TABLE_COUNT} tables in {fq_ranges_dataset}...")
+    print(f"Using {REF_VET_TABLE_COUNT} tables in {fq_ranges_dataset}...")

     # if we have a file of sample names, load it into a temporary table
     if (sample_names_to_extract):
@@ -289,7 +290,7 @@
     sample_ids = get_all_sample_ids(fq_destination_table_samples)
     print(f"Discovered {len(sample_ids)} samples in {fq_destination_table_samples}...")

-    # create the tables for exract data
+    # create the tables for extract data
     create_final_extract_ref_table(fq_destination_table_ref_data)
     create_final_extract_vet_table(fq_destination_table_vet_data)
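Note that call_AD is declared STRING, mirroring call_PL: the per-allele depths are comma-joined on the way into BigQuery and parsed back into integers at extract time. A hedged Java sketch of that round trip (the delimiter value and method names are illustrative assumptions, not the GVS schema code):

import java.util.Arrays;
import java.util.stream.Collectors;

public class AdStringCodec {
    private static final String DELIM = ","; // assumed value of the multivalue delimiter

    // {30, 2} -> "30,2", the shape stored in the call_AD STRING column
    static String encode(int[] ad) {
        return Arrays.stream(ad).mapToObj(Integer::toString).collect(Collectors.joining(DELIM));
    }

    // "30,2" -> {30, 2}, the parse the extract side performs
    static int[] decode(String callAD) {
        return Arrays.stream(callAD.split(DELIM)).mapToInt(Integer::parseInt).toArray();
    }

    public static void main(String[] args) {
        System.out.println(encode(new int[] {30, 2}));        // 30,2
        System.out.println(Arrays.toString(decode("30,2")));  // [30, 2]
    }
}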
SchemaUtils.java:

@@ -78,18 +78,19 @@ public class SchemaUtils {
     public static final String FILTERS = "filters";
     public static final List<String> FILTER_SET_SITE_FIELDS = Arrays.asList(FILTER_SET_NAME,LOCATION_FIELD_NAME,FILTERS);

-    public static final List<String> EXTRACT_VET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, AS_QUALapprox, QUALapprox, CALL_PL);
+    public static final List<String> EXTRACT_VET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, AS_QUALapprox, QUALapprox, CALL_PL, CALL_AD);
     public static final List<String> EXTRACT_REF_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, LENGTH_FIELD_NAME, STATE_FIELD_NAME);

     public static final List<String> COHORT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, CALL_RGQ, QUALapprox, AS_QUALapprox, CALL_PL);
     public static final List<String> COHORT_FIELDS_NO_PL = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, CALL_RGQ, QUALapprox, AS_QUALapprox);
     public static final List<String> COHORT_FIELDS_PL = Arrays.asList(CALL_PL);
+    public static final List<String> COHORT_FIELDS_AD = Arrays.asList(CALL_AD);
     public static final List<String> COHORT_FIELDS_REQ = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, CALL_RGQ, QUALapprox, AS_QUALapprox);

     public static final List<String> SAMPLE_FIELDS = Arrays.asList(SchemaUtils.SAMPLE_NAME_FIELD_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME);
     public static final List<String> YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, VQSLOD, YNG_STATUS);
     public static final List<String> TRANCHE_FIELDS = Arrays.asList(TARGET_TRUTH_SENSITIVITY, MIN_VQSLOD, TRANCHE_FILTER_NAME, TRANCHE_MODEL);


-    public static final List<String> PET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME);
+    public static final List<String> PET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME); // TODO do we still need?
     public static final List<String> VET_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, AS_RAW_MQ,
            AS_RAW_MQRankSum, QUALapprox, AS_QUALapprox, AS_RAW_ReadPosRankSum, AS_SB_TABLE, AS_VarDP, CALL_GT, CALL_AD, CALL_GQ, CALL_PGT, CALL_PID, CALL_PL);
     public static final List<String> ALT_ALLELE_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", ALT_ALLELE_FIELD_NAME, "allele_pos", CALL_GT, AS_RAW_MQ, RAW_MQ, AS_RAW_MQRankSum, "raw_mqranksum_x_10", AS_QUALapprox, "qual", AS_RAW_ReadPosRankSum, "raw_readposranksum_x_10", AS_SB_TABLE, "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS", CALL_AD, "ref_ad", "ad");
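COHORT_FIELDS_AD exists alongside COHORT_FIELDS_PL so the extract query can request the call_AD column only when allele depths are actually wanted; PLs and ADs are the widest per-genotype columns, so leaving them out of the select list is a real cost saving at scale. A hedged sketch of how such optional field groups are typically merged into one select list (illustrative field names; not the actual GVS call site):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CohortFieldSelection {
    static List<String> selectListFor(boolean emitPLs, boolean emitADs) {
        // Start from the always-required columns, then opt in to the heavy ones.
        List<String> fields = new ArrayList<>(Arrays.asList("location", "sample_id", "call_GT", "call_GQ"));
        if (emitPLs) fields.add("call_PL"); // analogous to COHORT_FIELDS_PL
        if (emitADs) fields.add("call_AD"); // analogous to COHORT_FIELDS_AD
        return fields;
    }

    public static void main(String[] args) {
        System.out.println(selectListFor(false, true));
        // [location, sample_id, call_GT, call_GQ, call_AD]
    }
}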
ExtractCohort.java:

@@ -139,6 +139,13 @@ public enum VQSLODFilteringType { GENOTYPE, SITES, NONE }
     )
     private boolean emitPLs = false;

+    @Argument(
+            fullName = "emit-ads",
+            doc = "Should allele depths be emitted in output VCF",
+            optional = true
+    )
+    private boolean emitADs = false;
+
     // what if this was a flag input only?

     @Argument(
@@ -283,6 +290,12 @@ protected void onStartup() {
             );
         }

+        if (emitADs) {
+            VCFStandardHeaderLines.addStandardFormatLines(extraHeaderLines, true,
+                    VCFConstants.GENOTYPE_ALLELE_DEPTHS
+            );
+        }
+
         SampleList sampleList = new SampleList(sampleTableName, sampleFileName, projectID, printDebugInformation, "extract-cohort");
         Map<Long, String> sampleIdToName = sampleList.getMap();

@@ -341,6 +354,7 @@ protected void onStartup() {
                 progressMeter,
                 filterSetName,
                 emitPLs,
+                emitADs,
                 vqslodfilteringType,
                 excludeFilteredSites,
                 inferredReferenceState,
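When --emit-ads is set, the tool registers the spec-standard FORMAT/AD header line through htsjdk rather than hand-writing one, exactly parallel to the existing emitPLs branch. A minimal self-contained sketch of that htsjdk call (stock htsjdk classes only; the surrounding GVS plumbing is omitted):

import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;

import java.util.HashSet;
import java.util.Set;

public class AdHeaderDemo {
    public static void main(String[] args) {
        Set<VCFHeaderLine> headerLines = new HashSet<>();
        // Pulls the spec-standard FORMAT/AD definition into the set;
        // 'true' means complain if a conflicting AD line is already present.
        VCFStandardHeaderLines.addStandardFormatLines(headerLines, true,
                VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        System.out.println(new VCFHeader(headerLines).getFormatHeaderLine(
                VCFConstants.GENOTYPE_ALLELE_DEPTHS));
    }
}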
ExtractCohortEngine.java:

@@ -1,6 +1,5 @@
 package org.broadinstitute.hellbender.tools.gvs.extract;

-
 import htsjdk.samtools.util.OverlapDetector;
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.GenotypeBuilder;
@@ -60,6 +59,7 @@ public class ExtractCohortEngine {
     private final ProgressMeter progressMeter;
     private final String projectID;
     private final boolean emitPLs;
+    private final boolean emitADs;

     /** List of sample names seen in the variant data from BigQuery. */
     private final Set<String> sampleNames;
@@ -116,6 +116,7 @@ public ExtractCohortEngine(final String projectID,
                                final ProgressMeter progressMeter,
                                final String filterSetName,
                                final boolean emitPLs,
+                               final boolean emitADs,
                                final ExtractCohort.VQSLODFilteringType VQSLODFilteringType,
                                final boolean excludeFilteredSites,
                                final GQStateEnum inferredReferenceState,
@@ -144,6 +145,7 @@ public ExtractCohortEngine(final String projectID,
         }

         this.emitPLs = emitPLs;
+        this.emitADs = emitADs;

         this.vetRangesFQDataSet = vetRangesFQDataSet;
         this.fqRangesExtractVetTable = fqRangesExtractVetTable;
@@ -779,6 +781,11 @@ private VariantContext createVariantContextFromSampleRecord(final ExtractCohortRecord
             genotypeBuilder.PL(Arrays.stream(callPL.split(SchemaUtils.MULTIVALUE_FIELD_DELIMITER)).mapToInt(Integer::parseInt).toArray());
         }

+        final String callAD = sampleRecord.getCallAD();
+        if ( this.emitADs && callAD != null ) {
+            genotypeBuilder.AD(Arrays.stream(callAD.split(SchemaUtils.MULTIVALUE_FIELD_DELIMITER)).mapToInt(Integer::parseInt).toArray());
+        }
+
         final String callRGQ = sampleRecord.getCallRGQ();
         if ( callRGQ != null ) {
             genotypeBuilder.attribute(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, callRGQ);
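At genotype-construction time the engine splits the stored AD string on SchemaUtils.MULTIVALUE_FIELD_DELIMITER and hands the parsed integers to htsjdk's GenotypeBuilder, just as the PL branch above it does. A small standalone sketch of that parse-and-set step (the sample name, alleles, and comma delimiter are illustrative assumptions):

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;

import java.util.Arrays;

public class AdGenotypeDemo {
    public static void main(String[] args) {
        final String callAD = "30,2"; // as read back from the call_AD STRING column
        final GenotypeBuilder gb = new GenotypeBuilder("sample1",
                Arrays.asList(Allele.create("A", true), Allele.create("T")));
        // Same parse the engine performs: split, parse ints, set AD on the genotype.
        gb.AD(Arrays.stream(callAD.split(",")).mapToInt(Integer::parseInt).toArray());
        final Genotype g = gb.make();
        System.out.println(Arrays.toString(g.getAD())); // [30, 2]
    }
}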
ExtractCohortRecord.java:

@@ -19,6 +19,7 @@ public class ExtractCohortRecord implements Locatable {
     private final String refAllele;
     private final String altAllele;
     private final String callGT;
+    private final String callAD;
     private final String callGQ;
     private final String callRGQ;
     private final String qualapprox;
@@ -54,6 +55,7 @@ public ExtractCohortRecord(GenericRecord genericRecord) {
         this.refAllele = Objects.toString(genericRecord.get(SchemaUtils.REF_ALLELE_FIELD_NAME), null);
         this.altAllele = Objects.toString(genericRecord.get(SchemaUtils.ALT_ALLELE_FIELD_NAME), null);
         this.callGT = Objects.toString(genericRecord.get(SchemaUtils.CALL_GT), null);
+        this.callAD = Objects.toString(genericRecord.get(SchemaUtils.CALL_AD), null);
         this.callGQ = Objects.toString(genericRecord.get(SchemaUtils.CALL_GQ), null);
         this.qualapprox = Objects.toString(genericRecord.get(SchemaUtils.QUALapprox), null);
         this.asQualapprox = Objects.toString(genericRecord.get(SchemaUtils.AS_QUALapprox), null);
@@ -84,6 +86,7 @@ public ExtractCohortRecord(long location, long sampleId, String state) {
         this.refAllele = null;
         this.altAllele = null;
         this.callGT = null;
+        this.callAD = null;
         this.callGQ = null;
         this.callRGQ = null;
         this.qualapprox = null;
@@ -112,6 +115,8 @@ public ExtractCohortRecord(long location, long sampleId, String state) {

     public String getCallGT() { return this.callGT; }

+    public String getCallAD() { return this.callAD; }
+
     public String getCallGQ() { return this.callGQ; }

     public String getCallRGQ() { return this.callRGQ; }
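Every field in the GenericRecord constructor flows through Objects.toString(value, null), which stringifies a present value and maps an absent one to a genuine null rather than the string "null" — which is why a record read from an Avro file written before this change simply carries callAD == null instead of failing. A tiny sketch of the pattern, with a plain Map standing in for the Avro record (hypothetical field names):

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

public class NullSafeFieldDemo {
    public static void main(String[] args) {
        Map<String, Object> record = new HashMap<>(); // stand-in for an Avro GenericRecord
        record.put("call_GT", "1/1");                 // present field
        // "call_AD" intentionally absent, like a pre-AD Avro file

        String callGT = Objects.toString(record.get("call_GT"), null); // "1/1"
        String callAD = Objects.toString(record.get("call_AD"), null); // null, not "null"
        System.out.println(callGT + " / " + callAD);
    }
}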
ExtractCohortRecord unit test:

@@ -32,6 +32,7 @@ public void testDefinedExtractCohortRecord() {
     Assert.assertEquals(allDefinedRecord.getRefAllele(), "CTTT");
     Assert.assertEquals(allDefinedRecord.getAltAllele(), "C");
     Assert.assertEquals(allDefinedRecord.getCallGT(), "1/1");
+    // Assert.assertEquals(allDefinedRecord.getCallAD(), "0,1"); // TODO need to add this to the extract Avro!
     Assert.assertEquals(allDefinedRecord.getCallGQ(), "3");
     Assert.assertEquals(allDefinedRecord.getCallRGQ(), "38");
     Assert.assertEquals(allDefinedRecord.getQUALApprox(), "38");
@@ -55,6 +56,7 @@ public void testNulledExtractCohortRecord() {
     Assert.assertNull(someNullsRecord.getRefAllele());
     Assert.assertNull(someNullsRecord.getAltAllele());
     Assert.assertNull(someNullsRecord.getCallGT());
+    Assert.assertNull(someNullsRecord.getCallAD());
     Assert.assertNull(someNullsRecord.getCallGQ());
     Assert.assertNull(someNullsRecord.getCallRGQ());
     Assert.assertNull(someNullsRecord.getQUALApprox());
