Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

VET Ingest Validation / Allow Ingest of non-VQSR'ed data #7870

Merged
merged 3 commits into from
Jun 2, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,15 @@ workflow GvsExtractCallset {
service_account_json_path = service_account_json_path
}

call ValidateFilterSetName {
input:
if ( !do_not_filter_override ) {
call ValidateFilterSetName {
input:
query_project = query_project,
filter_set_name = filter_set_name,
data_project = project_id,
data_dataset = dataset_name,
service_account_json_path = service_account_json_path
}
}

scatter(i in range(length(SplitIntervals.interval_files))) {
Expand All @@ -116,7 +118,7 @@ workflow GvsExtractCallset {
fq_filter_set_site_table = fq_filter_set_site_table,
fq_filter_set_tranches_table = fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
filter_set_name_verified = ValidateFilterSetName.done,
filter_set_name_verified = select_first([ValidateFilterSetName.done, "done"]),
service_account_json_path = service_account_json_path,
drop_state = "FORTY",
output_file = vcf_filename,
Expand Down
9 changes: 7 additions & 2 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ workflow GvsImportGenomes {
Array[File] input_vcfs
Array[File] input_vcf_indexes

Boolean skip_loading_vqsr_fields = false

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
# If increasing this, also consider increasing `load_data_preemptible_override` and `load_data_maxretries_override`.
Int load_data_batch_size = 5
Int? load_data_preemptible_override
Int? load_data_maxretries_override
File? load_data_gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20220415/gatk-package-4.2.0.0-492-g1387d47-SNAPSHOT-local.jar"
File? load_data_gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/gg_VS-443_VETIngestValidation_20220531/gatk-package-4.2.0.0-531-gf8f4ede-SNAPSHOT-local.jar"
String? service_account_json_path
}

Expand Down Expand Up @@ -65,6 +67,7 @@ workflow GvsImportGenomes {
dataset_name = dataset_name,
project_id = project_id,
duplicate_check_passed = CheckForDuplicateData.done,
skip_loading_vqsr_fields = skip_loading_vqsr_fields,
drop_state = "FORTY",
drop_state_includes_greater_than = false,
input_vcf_indexes = read_lines(CreateFOFNs.vcf_batch_vcf_index_fofns[i]),
Expand Down Expand Up @@ -211,6 +214,7 @@ task LoadData {
String? drop_state
Boolean? drop_state_includes_greater_than = false
Boolean force_loading_from_non_allele_specific = false
Boolean skip_loading_vqsr_fields = false

File? gatk_override
Int? load_data_preemptible_override
Expand Down Expand Up @@ -282,7 +286,8 @@ task LoadData {
--enable-vet ~{load_vet} \
-SN ${sample_name} \
-SNM ~{sample_map} \
--ref-version 38
--ref-version 38 \
--skip-loading-vqsr-fields ~{skip_loading_vqsr_fields}
done
>>>
runtime {
Expand Down
5 changes: 5 additions & 0 deletions scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,11 @@ task GetBQTablesMaxLastModifiedTimestamp {
}

task BuildGATKJarAndCreateDataset {
# Since this might be called repeatedly on the same branch (and the latest commit might have been updated), we never use call caching for this task.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This addresses VS-455

meta {
volatile: true
}

input {
String branch_name
String dataset_prefix
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,13 @@ public final class CreateVariantIngestFiles extends VariantWalker {
)
public boolean forceLoadingFromNonAlleleSpecific = false;

@Argument(
fullName = "skip-loading-vqsr-fields",
doc = "Do not load data for fields specific to VQSR",
optional = true
)
public boolean skipLoadingVqsrFields = false;

private boolean shouldWriteLoadStatusStarted = true;

// getGenotypes() returns list of lists for all samples at variant
Expand Down Expand Up @@ -248,7 +255,7 @@ public void onTraversalStart() {
}

if (enableVet && !vetRowsExist) {
vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific);
vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific, skipLoadingVqsrFields);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class VetCreator {
private final String sampleId;
private PendingBQWriter vetBQJsonWriter = null;
private final boolean forceLoadingFromNonAlleleSpecific;
private final boolean skipLoadingVqsrFields;

private static final String VET_FILETYPE_PREFIX = "vet_";

Expand All @@ -35,10 +36,11 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr
return BigQueryUtils.doRowsExistFor(projectId, datasetName, VET_FILETYPE_PREFIX + tableNumber, sampleId);
}

public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific) {
public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields) {
this.sampleId = sampleId;
this.outputType = outputType;
this.forceLoadingFromNonAlleleSpecific = forceLoadingFromNonAlleleSpecific;
this.skipLoadingVqsrFields = skipLoadingVqsrFields;

try {
String PREFIX_SEPARATOR = "_";
Expand All @@ -64,11 +66,6 @@ public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, Str
public void apply(VariantContext variant) throws IOException {
int start = variant.getStart();
long location = SchemaUtils.encodeLocation(variant.getContig(), start);
List<String> row = createRow(
location,
variant,
sampleId
);
Comment on lines -67 to -71
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TOL: perhaps, rather than deleting this and inlining the createRow on line 80, this could just be moved into the TSV case?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand. Do you not like the inlining and want the code block to remain explicit in the TSV case?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I was suggesting to keep this block explicit but only in the case TSV: since that's the only spot it's used.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


switch(outputType) {
case BQ:
Expand All @@ -80,7 +77,7 @@ public void apply(VariantContext variant) throws IOException {
break;
case TSV:
// write the variant to the XSV
vetWriter.getNewLineBuilder().setRow(row).write();
vetWriter.getNewLineBuilder().setRow(createRow(location, variant, sampleId)).write();
break;

}
Expand All @@ -91,9 +88,11 @@ public List<String> createRow(final long location, final VariantContext variant,
for ( final VetFieldEnum fieldEnum : VetFieldEnum.values() ) {
if (fieldEnum.equals(VetFieldEnum.location)) {
row.add(String.valueOf(location));
} else if (fieldEnum.equals(VetFieldEnum.sample_id)) {
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The if/else reformatting here and below is actually contrary to what my IntelliJ would do; is there a reason for this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's me. I followed a different style guide in a former life and don't like the way it looks like that. But I'll stick with what IJ does.

else if (fieldEnum.equals(VetFieldEnum.sample_id)) {
row.add(sampleId);
} else {
}
else if ( !(skipLoadingVqsrFields && fieldEnum.isVqsrSpecificField() ) ) {
row.add(fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific));
}
}
Expand All @@ -106,12 +105,15 @@ public JSONObject createJson(final long location, final VariantContext variant,
for ( final VetFieldEnum fieldEnum : VetFieldEnum.values() ) {
if (fieldEnum.equals(VetFieldEnum.location)) {
jsonObject.put(VetFieldEnum.location.toString(), location);
} else if (fieldEnum.equals(VetFieldEnum.sample_id)) {
}
else if (fieldEnum.equals(VetFieldEnum.sample_id)) {
jsonObject.put(VetFieldEnum.sample_id.toString(), sampleId);
} else if (fieldEnum.equals(VetFieldEnum.call_GQ)) {
}
else if (fieldEnum.equals(VetFieldEnum.call_GQ)) {
jsonObject.put(fieldEnum.toString(), Integer.valueOf(fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific)));
} else {
String strVal = fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific);
}
else {
final String strVal = !(skipLoadingVqsrFields && fieldEnum.isVqsrSpecificField()) ? fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific) : "";
jsonObject.put(fieldEnum.toString(), StringUtils.isEmpty(strVal) ? null : strVal);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@
* location, // req
* reference_bases, // req
* alternate_bases_alt, // req
* alternate_bases_AS_RAW_MQ, // req
* alternate_bases_AS_RAW_MQ,
* alternate_bases_AS_RAW_MQRankSum,
* alternate_bases_AS_QUALapprox, // req
* alternate_bases_AS_QUALapprox,
* alternate_bases_AS_RAW_ReadPosRankSum,
* alternate_bases_AS_SB_TABLE, // req
* alternate_bases_AS_VarDP, // req
* alternate_bases_AS_SB_TABLE,
* alternate_bases_AS_VarDP,
* call_genotype, // req
* call_AD,
* call_DP, // Laura says consider removing for now-- so similar to AS_VarDP
Expand Down Expand Up @@ -76,7 +76,7 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
},

AS_RAW_MQ {
// Required
// Required for VQSR Data
// can strip off the first one?
// TODO sci notation?
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
Expand All @@ -87,10 +87,10 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
throw new UserException("Cannot be missing required value for alternate_bases.AS_RAW_MQ, RAW_MQandDP or RAW_MQ.");
}
if (!forceLoadingFromNonAlleleSpecific && out != null) {
if(!out.endsWith("|0.00")) {
// KC: we are seeing a TON of these!
// if (!out.endsWith("|0.00")) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

commented out code

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, @kcibul commented out the logger statement, I just expanded the commenting out to take out all the functional code.

// logger.warn("Expected AS_RAW_MQ value to end in |0.00. value is: " + out + " for variant " + variant.toString());
}
// }
out = out.substring(0, out.lastIndexOf("|"));
String[] outValues = out.split("\\|");
out = Arrays
Expand Down Expand Up @@ -124,9 +124,13 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
return outNotAlleleSpecific;
}
}
public boolean isVqsrSpecificField() {
return true;
}
},

AS_RAW_MQRankSum { // TODO -- maybe rely on 1/1 for call_GT, also get rid of the | at the beginning
// Required for VQSR Data
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
// in the case where neither allele is reference, don't return a value
if (isGenotypeAllNonRef(variant.getGenotype(0))) {
Expand Down Expand Up @@ -175,27 +179,34 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
//throw new UserException("Expected AS_RAW_MQRankSum value to be ||, ||| or to end in |NaN");
}
return out;

}
public boolean isVqsrSpecificField() {
return true;
}
},

QUALapprox { // Required
QUALapprox {
// Required for VQSR Data
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
String out = getAttribute(variant, RAW_QUAL_APPROX_KEY, null);
if (out == null) {
throw new UserException("Cannot be missing required value for QUALapprox at site: " + variant.toString());
throw new UserException("Cannot be missing required value for QUALapprox at site: " + variant);
}
return out;
}
public boolean isVqsrSpecificField() {
return true;
}
},

AS_QUALapprox { // Required
AS_QUALapprox {
// Required for VQSR Data
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
String out = getAttribute(variant, AS_RAW_QUAL_APPROX_KEY, null);
if (out == null) {
String outNotAlleleSpecific = getAttribute(variant, RAW_QUAL_APPROX_KEY, null);
if (outNotAlleleSpecific == null) {
throw new UserException("Cannot be missing required value for AS_QUALapprox or QUALapprox at site: " + variant.toString());
throw new UserException("Cannot be missing required value for AS_QUALapprox or QUALapprox at site: " + variant);
}
return outNotAlleleSpecific;
}
Expand All @@ -217,6 +228,9 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
}
return out;
}
public boolean isVqsrSpecificField() {
return true;
}
},

AS_RAW_ReadPosRankSum { // TODO -- maybe rely on 1/1 for call_GT
Expand Down Expand Up @@ -268,9 +282,13 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
}
return out;
}
public boolean isVqsrSpecificField() {
return true;
}
},

AS_SB_TABLE { // Required
AS_SB_TABLE {
// Required for VQSR Data
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
String out = getAttribute(variant, GATKVCFConstants.AS_SB_TABLE_KEY, null);
if (forceLoadingFromNonAlleleSpecific || out == null) {
Expand All @@ -296,16 +314,20 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
}
return out;
}
public boolean isVqsrSpecificField() {
return true;
}
},

AS_VarDP { // Required
AS_VarDP {
// Required for VQSR Data
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
String out = getAttribute(variant, AS_VARIANT_DEPTH_KEY, null);
if (out == null) {
String varDP = getAttribute(variant, VARIANT_DEPTH_KEY, null);
String dP = getAttribute(variant, DEPTH_KEY, null);
if (varDP == null || dP == null) {
throw new UserException("Cannot be missing required value for AS_VarDP, or VarDP and DP, at site:" + variant.toString());
throw new UserException("Cannot be missing required value for AS_VarDP, or VarDP and DP, at site:" + variant);
}
int refDP = Integer.parseInt(dP) - Integer.parseInt(varDP);
out = refDP + "|" + varDP + "|";
Expand All @@ -319,10 +341,16 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
}
return out;
}
public boolean isVqsrSpecificField() {
return true;
}
},

call_GT {
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
if (!variant.hasGenotypes()) {
throw new UserException("Cannot be missing required value for call.GT");
}
// TODO how is missing handled?
return CommonCode.getGTString(variant);
}
Expand Down Expand Up @@ -357,8 +385,7 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
call_GQ { // Required
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) {
if (!variant.getGenotype(0).hasGQ()) {
//throw new UserException("Cannot be missing required value for call.GQ");
return "";
throw new UserException("Cannot be missing required value for call.GQ");
}
return String.valueOf(variant.getGenotype(0).getGQ());
}
Expand Down Expand Up @@ -389,6 +416,10 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo
throw new IllegalArgumentException("Not implemented");
}

public boolean isVqsrSpecificField() {
return false;
}

public static boolean isGenotypeAllNonRef(Genotype g) {
return g.getAllele(0).isNonReference() && g.getAllele(1).isNonReference();
}
Expand Down