Adding AD for scale testing VS 225 add AD (#7713)
* add call_AD to prepare py

* add callAD to the extract record

* add callAD to engine

* update schema for PLs and AD

* add callAD to extract

* still need to update the Avro file

* update dockstore

* add updated jar

* add emit AD to wdl

* pass wdl vars

* add docker image

* add AD to rescattering workflow in dockstore

* update naming

* update wdl with newest jar
RoriCremer authored Apr 12, 2022
1 parent f49e122 commit fd960a4
Showing 9 changed files with 60 additions and 18 deletions.
4 changes: 3 additions & 1 deletion .dockstore.yml

@@ -79,6 +79,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - rc-vs-225-add-AD
   - name: GvsCreateFilterSet
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -116,7 +117,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - kc_wdl_gatk_override
+        - rc-vs-225-add-AD
   - name: GvsImportGenomes
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -145,6 +146,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - rc-vs-225-add-AD
   - name: GvsCreateVAT
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateVAT.wdl
12 changes: 11 additions & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl

@@ -15,7 +15,7 @@ workflow GvsExtractCallset {

     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
-    File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220404/gatk-package-4.2.0.0-485-g86fd5ac-SNAPSHOT-local.jar"
+    File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/rc-add-AD-04112022/gatk-package-4.2.0.0-498-g1f53709-SNAPSHOT-local.jar"

     String output_file_base_name = filter_set_name

@@ -42,6 +42,9 @@ workflow GvsExtractCallset {
     String fq_ranges_dataset = "~{project_id}.~{dataset_name}"
     Array[String] tables_patterns_for_datetime_check = ["~{full_extract_prefix}__%"]

+    Boolean emit_pls = false
+    Boolean emit_ads = true
+
     call Utils.SplitIntervals {
         input:
             intervals = interval_list,
@@ -102,6 +105,8 @@
             max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
             extract_preemptible_override = extract_preemptible_override,
             extract_maxretries_override = extract_maxretries_override,
+            emit_pls = emit_pls,
+            emit_ads = emit_ads,
        }
    }

@@ -200,6 +205,9 @@ task ExtractTask {
        String output_file
        String? output_gcs_dir

+       Boolean emit_pls
+       Boolean emit_ads
+
        Boolean do_not_filter_override
        String fq_ranges_dataset
        String fq_filter_set_info_table
@@ -256,6 +264,8 @@
            ~{"--inferred-reference-state " + drop_state} \
            -L ~{intervals} \
            --project-id ~{read_project_id} \
+           ~{true='--emit-pls' false='' emit_pls} \
+           ~{true='--emit-ads' false='' emit_ads} \
            ${FILTERING_ARGS}

        # Drop trailing slash if one exists
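The last hunk is how the new Booleans reach GATK: WDL's ~{true='--emit-ads' false='' emit_ads} interpolation renders the flag when the value is true and renders nothing when it is false, so an unset option vanishes from the command line rather than being passed as --emit-ads=false. As a rough illustration only (plain Java, not GVS code), the equivalent conditional argument assembly:

import java.util.ArrayList;
import java.util.List;

public class EmitFlagArgs {
    // Mirrors ~{true='--emit-pls' false='' emit_pls}: add the flag, or add nothing at all.
    static List<String> flagsFor(boolean emitPls, boolean emitAds) {
        List<String> args = new ArrayList<>();
        if (emitPls) args.add("--emit-pls");
        if (emitAds) args.add("--emit-ads");
        return args;
    }

    public static void main(String[] unused) {
        System.out.println(flagsFor(false, true)); // [--emit-ads]
    }
}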
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl

@@ -111,7 +111,7 @@ task PrepareRangesCallsetTask {
    }

    runtime {
-       docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_add_sample_columns_2022_03_28"
+       docker: "us.gcr.io/broad-dsde-methods/variantstore:rc_add_AD_2022_03_30"
        memory: "3 GB"
        disks: "local-disk 100 HDD"
        bootDiskSizeGb: 15
Python callset-prepare script:

@@ -26,7 +26,7 @@
 output_table_prefix = str(uuid.uuid4()).split("-")[0]
 print(f"running with prefix {output_table_prefix}")

-PET_VET_TABLE_COUNT = -1
+REF_VET_TABLE_COUNT = -1
 client = None

 EXTRACT_SAMPLE_TABLE = f"{output_table_prefix}_sample_names"
@@ -45,7 +45,7 @@ def dump_job_stats():
     print(" Total GBs billed ", total/(1024 * 1024 * 1024), " GBs")

 def get_partition_range(i):
-    if i < 1 or i > PET_VET_TABLE_COUNT:
+    if i < 1 or i > REF_VET_TABLE_COUNT:
         raise ValueError(f"out of partition range")

     return { 'start': (i-1)*SAMPLES_PER_PARTITION + 1, 'end': i*SAMPLES_PER_PARTITION }
@@ -115,6 +115,7 @@ def create_final_extract_vet_table(fq_destination_table_vet_data):
        alt STRING,
        call_GT STRING,
        call_GQ INT64,
+       call_AD STRING,
        AS_QUALapprox STRING,
        QUALapprox STRING,
        call_PL STRING
@@ -151,7 +152,7 @@ def get_ref_subselect(fq_ref_table, samples, id):
            f" `{fq_ref_table}` WHERE sample_id IN ({sample_stanza})), "
        return sql

-    for i in range(1, PET_VET_TABLE_COUNT+1):
+    for i in range(1, REF_VET_TABLE_COUNT+1):
        partition_samples = get_samples_for_partition(sample_ids, i)  # sample ids for the partition

        if len(partition_samples) > 0:
@@ -180,16 +181,16 @@ def get_ref_subselect(fq_ref_table, samples, id):
 def populate_final_extract_table_with_vet(fq_ranges_dataset, fq_destination_table_data, sample_ids):
     def get_ref_subselect(fq_vet_table, samples, id):
         sample_stanza = ','.join([str(s) for s in samples])
-        sql = f" q_{id} AS (SELECT location, sample_id, ref, alt, call_GT, call_GQ, AS_QUALapprox, QUALapprox, CALL_PL FROM \n" \
+        sql = f" q_{id} AS (SELECT location, sample_id, ref, alt, call_GT, call_GQ, call_AD, AS_QUALapprox, QUALapprox, CALL_PL FROM \n" \
              f" `{fq_vet_table}` WHERE sample_id IN ({sample_stanza})), "
         return sql

-    for i in range(1, PET_VET_TABLE_COUNT+1):
+    for i in range(1, REF_VET_TABLE_COUNT+1):
         partition_samples = get_samples_for_partition(sample_ids, i)  # sample ids for the partition

         if len(partition_samples) > 0:
             subs = {}
-            insert = f"\nINSERT INTO `{fq_destination_table_data}` (location, sample_id, ref, alt, call_GT, call_GQ, AS_QUALapprox, QUALapprox, CALL_PL) \n WITH \n"
+            insert = f"\nINSERT INTO `{fq_destination_table_data}` (location, sample_id, ref, alt, call_GT, call_GQ, call_AD, AS_QUALapprox, QUALapprox, CALL_PL) \n WITH \n"
             fq_vet_table = f"{fq_ranges_dataset}.{VET_TABLE_PREFIX}{i:03}"
             j = 1
@@ -264,16 +265,16 @@ def make_extract_table(control_samples,

     ## TODO -- provide a cmdline arg to override this (so we can simulate smaller datasets)

-    global PET_VET_TABLE_COUNT
-    PET_VET_TABLE_COUNT = max_tables
+    global REF_VET_TABLE_COUNT  ## TODO why are we using PET here?
+    REF_VET_TABLE_COUNT = max_tables

     global TEMP_TABLE_TTL_HOURS
     TEMP_TABLE_TTL_HOURS = temp_table_ttl_hours

     global TEMP_TABLE_TTL
     TEMP_TABLE_TTL = f" OPTIONS( expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {TEMP_TABLE_TTL_HOURS} HOUR)) "

-    print(f"Using {PET_VET_TABLE_COUNT} tables in {fq_ranges_dataset}...")
+    print(f"Using {REF_VET_TABLE_COUNT} tables in {fq_ranges_dataset}...")

     # if we have a file of sample names, load it into a temporary table
     if (sample_names_to_extract):
@@ -289,7 +290,7 @@
     sample_ids = get_all_sample_ids(fq_destination_table_samples)
     print(f"Discovered {len(sample_ids)} samples in {fq_destination_table_samples}...")

-    # create the tables for exract data
+    # create the tables for extract data
     create_final_extract_ref_table(fq_destination_table_ref_data)
     create_final_extract_vet_table(fq_destination_table_vet_data)
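Note that call_AD is declared STRING, mirroring call_PL: the per-allele depths are comma-joined on the way into BigQuery and parsed back into integers at extract time. A hedged Java sketch of that round trip (the delimiter value and method names are illustrative assumptions, not the GVS schema code):

import java.util.Arrays;
import java.util.stream.Collectors;

public class AdStringCodec {
    private static final String DELIM = ","; // assumed value of the multivalue delimiter

    // {30, 2} -> "30,2", the shape stored in the call_AD STRING column
    static String encode(int[] ad) {
        return Arrays.stream(ad).mapToObj(Integer::toString).collect(Collectors.joining(DELIM));
    }

    // "30,2" -> {30, 2}, the parse the extract side performs
    static int[] decode(String callAD) {
        return Arrays.stream(callAD.split(DELIM)).mapToInt(Integer::parseInt).toArray();
    }

    public static void main(String[] args) {
        System.out.println(encode(new int[] {30, 2}));        // 30,2
        System.out.println(Arrays.toString(decode("30,2")));  // [30, 2]
    }
}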
SchemaUtils.java:

@@ -78,18 +78,19 @@ public class SchemaUtils {
     public static final String FILTERS = "filters";
     public static final List<String> FILTER_SET_SITE_FIELDS = Arrays.asList(FILTER_SET_NAME,LOCATION_FIELD_NAME,FILTERS);

-    public static final List<String> EXTRACT_VET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, AS_QUALapprox, QUALapprox, CALL_PL);
+    public static final List<String> EXTRACT_VET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, AS_QUALapprox, QUALapprox, CALL_PL, CALL_AD);
     public static final List<String> EXTRACT_REF_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, LENGTH_FIELD_NAME, STATE_FIELD_NAME);

     public static final List<String> COHORT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, CALL_RGQ, QUALapprox, AS_QUALapprox, CALL_PL);
     public static final List<String> COHORT_FIELDS_NO_PL = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, CALL_RGQ, QUALapprox, AS_QUALapprox);
     public static final List<String> COHORT_FIELDS_PL = Arrays.asList(CALL_PL);
+    public static final List<String> COHORT_FIELDS_AD = Arrays.asList(CALL_AD);
     public static final List<String> COHORT_FIELDS_REQ = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALL_GT, CALL_GQ, CALL_RGQ, QUALapprox, AS_QUALapprox);

     public static final List<String> SAMPLE_FIELDS = Arrays.asList(SchemaUtils.SAMPLE_NAME_FIELD_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME);
     public static final List<String> YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, VQSLOD, YNG_STATUS);
     public static final List<String> TRANCHE_FIELDS = Arrays.asList(TARGET_TRUTH_SENSITIVITY, MIN_VQSLOD, TRANCHE_FILTER_NAME, TRANCHE_MODEL);


-    public static final List<String> PET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME);
+    public static final List<String> PET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME); // TODO do we still need?
     public static final List<String> VET_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, AS_RAW_MQ,
            AS_RAW_MQRankSum, QUALapprox, AS_QUALapprox, AS_RAW_ReadPosRankSum, AS_SB_TABLE, AS_VarDP, CALL_GT, CALL_AD, CALL_GQ, CALL_PGT, CALL_PID, CALL_PL);
     public static final List<String> ALT_ALLELE_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", ALT_ALLELE_FIELD_NAME, "allele_pos", CALL_GT, AS_RAW_MQ, RAW_MQ, AS_RAW_MQRankSum, "raw_mqranksum_x_10", AS_QUALapprox, "qual", AS_RAW_ReadPosRankSum, "raw_readposranksum_x_10", AS_SB_TABLE, "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS", CALL_AD, "ref_ad", "ad");
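COHORT_FIELDS_AD exists alongside COHORT_FIELDS_PL so the extract query can request the call_AD column only when allele depths are actually wanted; PLs and ADs are the widest per-genotype columns, so leaving them out of the select list is a real cost saving at scale. A hedged sketch of how such optional field groups are typically merged into one select list (illustrative field names; not the actual GVS call site):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CohortFieldSelection {
    static List<String> selectListFor(boolean emitPLs, boolean emitADs) {
        // Start from the always-required columns, then opt in to the heavy ones.
        List<String> fields = new ArrayList<>(Arrays.asList("location", "sample_id", "call_GT", "call_GQ"));
        if (emitPLs) fields.add("call_PL"); // analogous to COHORT_FIELDS_PL
        if (emitADs) fields.add("call_AD"); // analogous to COHORT_FIELDS_AD
        return fields;
    }

    public static void main(String[] args) {
        System.out.println(selectListFor(false, true));
        // [location, sample_id, call_GT, call_GQ, call_AD]
    }
}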
ExtractCohort.java:

@@ -139,6 +139,13 @@ public enum VQSLODFilteringType { GENOTYPE, SITES, NONE }
     )
     private boolean emitPLs = false;

+    @Argument(
+            fullName = "emit-ads",
+            doc = "Should allele depths be emitted in output VCF",
+            optional = true
+    )
+    private boolean emitADs = false;
+
     // what if this was a flag input only?

     @Argument(
@@ -283,6 +290,12 @@ protected void onStartup() {
             );
         }

+        if (emitADs) {
+            VCFStandardHeaderLines.addStandardFormatLines(extraHeaderLines, true,
+                    VCFConstants.GENOTYPE_ALLELE_DEPTHS
+            );
+        }
+
         SampleList sampleList = new SampleList(sampleTableName, sampleFileName, projectID, printDebugInformation, "extract-cohort");
         Map<Long, String> sampleIdToName = sampleList.getMap();

@@ -341,6 +354,7 @@ protected void onStartup() {
                 progressMeter,
                 filterSetName,
                 emitPLs,
+                emitADs,
                 vqslodfilteringType,
                 excludeFilteredSites,
                 inferredReferenceState,
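When --emit-ads is set, the tool registers the spec-standard FORMAT/AD header line through htsjdk rather than hand-writing one, exactly parallel to the existing emitPLs branch. A minimal self-contained sketch of that htsjdk call (stock htsjdk classes only; the surrounding GVS plumbing is omitted):

import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;

import java.util.HashSet;
import java.util.Set;

public class AdHeaderDemo {
    public static void main(String[] args) {
        Set<VCFHeaderLine> headerLines = new HashSet<>();
        // Pulls the spec-standard FORMAT/AD definition into the set;
        // 'true' means complain if a conflicting AD line is already present.
        VCFStandardHeaderLines.addStandardFormatLines(headerLines, true,
                VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        System.out.println(new VCFHeader(headerLines).getFormatHeaderLine(
                VCFConstants.GENOTYPE_ALLELE_DEPTHS));
    }
}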
ExtractCohortEngine.java:

@@ -1,6 +1,5 @@
 package org.broadinstitute.hellbender.tools.gvs.extract;

-
 import htsjdk.samtools.util.OverlapDetector;
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.GenotypeBuilder;
@@ -60,6 +59,7 @@ public class ExtractCohortEngine {
     private final ProgressMeter progressMeter;
     private final String projectID;
     private final boolean emitPLs;
+    private final boolean emitADs;

     /** List of sample names seen in the variant data from BigQuery. */
     private final Set<String> sampleNames;
@@ -116,6 +116,7 @@ public ExtractCohortEngine(final String projectID,
                                final ProgressMeter progressMeter,
                                final String filterSetName,
                                final boolean emitPLs,
+                               final boolean emitADs,
                                final ExtractCohort.VQSLODFilteringType VQSLODFilteringType,
                                final boolean excludeFilteredSites,
                                final GQStateEnum inferredReferenceState,
@@ -144,6 +145,7 @@ public ExtractCohortEngine(final String projectID,
         }

         this.emitPLs = emitPLs;
+        this.emitADs = emitADs;

         this.vetRangesFQDataSet = vetRangesFQDataSet;
         this.fqRangesExtractVetTable = fqRangesExtractVetTable;
@@ -779,6 +781,11 @@ private VariantContext createVariantContextFromSampleRecord(final ExtractCohortRecord
             genotypeBuilder.PL(Arrays.stream(callPL.split(SchemaUtils.MULTIVALUE_FIELD_DELIMITER)).mapToInt(Integer::parseInt).toArray());
         }

+        final String callAD = sampleRecord.getCallAD();
+        if ( this.emitADs && callAD != null ) {
+            genotypeBuilder.AD(Arrays.stream(callAD.split(SchemaUtils.MULTIVALUE_FIELD_DELIMITER)).mapToInt(Integer::parseInt).toArray());
+        }
+
         final String callRGQ = sampleRecord.getCallRGQ();
         if ( callRGQ != null ) {
             genotypeBuilder.attribute(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, callRGQ);
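At genotype-construction time the engine splits the stored AD string on SchemaUtils.MULTIVALUE_FIELD_DELIMITER and hands the parsed integers to htsjdk's GenotypeBuilder, just as the PL branch above it does. A small standalone sketch of that parse-and-set step (the sample name, alleles, and comma delimiter are illustrative assumptions):

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;

import java.util.Arrays;

public class AdGenotypeDemo {
    public static void main(String[] args) {
        final String callAD = "30,2"; // as read back from the call_AD STRING column
        final GenotypeBuilder gb = new GenotypeBuilder("sample1",
                Arrays.asList(Allele.create("A", true), Allele.create("T")));
        // Same parse the engine performs: split, parse ints, set AD on the genotype.
        gb.AD(Arrays.stream(callAD.split(",")).mapToInt(Integer::parseInt).toArray());
        final Genotype g = gb.make();
        System.out.println(Arrays.toString(g.getAD())); // [30, 2]
    }
}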
ExtractCohortRecord.java:

@@ -19,6 +19,7 @@ public class ExtractCohortRecord implements Locatable {
     private final String refAllele;
     private final String altAllele;
     private final String callGT;
+    private final String callAD;
     private final String callGQ;
     private final String callRGQ;
     private final String qualapprox;
@@ -54,6 +55,7 @@ public ExtractCohortRecord(GenericRecord genericRecord) {
         this.refAllele = Objects.toString(genericRecord.get(SchemaUtils.REF_ALLELE_FIELD_NAME), null);
         this.altAllele = Objects.toString(genericRecord.get(SchemaUtils.ALT_ALLELE_FIELD_NAME), null);
         this.callGT = Objects.toString(genericRecord.get(SchemaUtils.CALL_GT), null);
+        this.callAD = Objects.toString(genericRecord.get(SchemaUtils.CALL_AD), null);
         this.callGQ = Objects.toString(genericRecord.get(SchemaUtils.CALL_GQ), null);
         this.qualapprox = Objects.toString(genericRecord.get(SchemaUtils.QUALapprox), null);
         this.asQualapprox = Objects.toString(genericRecord.get(SchemaUtils.AS_QUALapprox), null);
@@ -84,6 +86,7 @@ public ExtractCohortRecord(long location, long sampleId, String state) {
         this.refAllele = null;
         this.altAllele = null;
         this.callGT = null;
+        this.callAD = null;
         this.callGQ = null;
         this.callRGQ = null;
         this.qualapprox = null;
@@ -112,6 +115,8 @@ public ExtractCohortRecord(long location, long sampleId, String state) {

     public String getCallGT() { return this.callGT; }

+    public String getCallAD() { return this.callAD; }
+
     public String getCallGQ() { return this.callGQ; }

     public String getCallRGQ() { return this.callRGQ; }
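Every field in the GenericRecord constructor flows through Objects.toString(value, null), which stringifies a present value and maps an absent one to a genuine null rather than the string "null" — which is why a record read from an Avro file written before this change simply carries callAD == null instead of failing. A tiny sketch of the pattern, with a plain Map standing in for the Avro record (hypothetical field names):

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

public class NullSafeFieldDemo {
    public static void main(String[] args) {
        Map<String, Object> record = new HashMap<>(); // stand-in for an Avro GenericRecord
        record.put("call_GT", "1/1");                 // present field
        // "call_AD" intentionally absent, like a pre-AD Avro file

        String callGT = Objects.toString(record.get("call_GT"), null); // "1/1"
        String callAD = Objects.toString(record.get("call_AD"), null); // null, not "null"
        System.out.println(callGT + " / " + callAD);
    }
}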
ExtractCohortRecord unit test:

@@ -32,6 +32,7 @@ public void testDefinedExtractCohortRecord() {
     Assert.assertEquals(allDefinedRecord.getRefAllele(), "CTTT");
     Assert.assertEquals(allDefinedRecord.getAltAllele(), "C");
     Assert.assertEquals(allDefinedRecord.getCallGT(), "1/1");
+    // Assert.assertEquals(allDefinedRecord.getCallAD(), "0,1"); // TODO need to add this to the extract Avro!
     Assert.assertEquals(allDefinedRecord.getCallGQ(), "3");
     Assert.assertEquals(allDefinedRecord.getCallRGQ(), "38");
     Assert.assertEquals(allDefinedRecord.getQUALApprox(), "38");
@@ -55,6 +56,7 @@ public void testNulledExtractCohortRecord() {
     Assert.assertNull(someNullsRecord.getRefAllele());
     Assert.assertNull(someNullsRecord.getAltAllele());
     Assert.assertNull(someNullsRecord.getCallGT());
+    Assert.assertNull(someNullsRecord.getCallAD());
     Assert.assertNull(someNullsRecord.getCallGQ());
     Assert.assertNull(someNullsRecord.getCallRGQ());
     Assert.assertNull(someNullsRecord.getQUALApprox());
