optimized TSV experiment, and range GQ dropping #6987

Merged: 3 commits, Dec 8, 2020
9 changes: 5 additions & 4 deletions scripts/variantstore/wdl/ImportGenomes.example.inputs.json
@@ -3,18 +3,19 @@
"ImportGenomes.vet_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/vet_schema.relaxed.json",
"ImportGenomes.metadata_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/sample_list_schema.json",
"ImportGenomes.interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
"ImportGenomes.drop_state": "SIXTY",
"ImportGenomes.gatk_override": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/gatk-package-4.1.8.1-149-g6b93ec6-SNAPSHOT-local.jar",
"ImportGenomes.drop_state": "FORTY",
"ImportGenomes.drop_state_includes_greater_than": "true",
"ImportGenomes.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"ImportGenomes.project_id": "spec-ops-aou",
"ImportGenomes.dataset_name": "kc_import_test5",
"ImportGenomes.dataset_name": "kc_import_test1",

"ImportGenomes.sample_map": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/test_sample_map.csv",

"ImportGenomes.input_vcfs": ["gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/9a5be44d-c032-40ef-bd4f-8de736299484/ReblockGVCF/65c5b1d0-4aeb-44cc-a696-710dbc5f60f1/call-Reblock/BI_NA14170_A11_99000142090_SM-GXZV3_1.vcf.gz",
"gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/9a5be44d-c032-40ef-bd4f-8de736299484/ReblockGVCF/f02d7f4e-a19b-4d84-b0de-0750c9b3cfd9/call-Reblock/BI_NA23079_1_99000142156_SM-GXZVL_1.vcf.gz",
"gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/9a5be44d-c032-40ef-bd4f-8de736299484/ReblockGVCF/dc63b7c7-9d13-4674-8680-072ee04e9237/call-Reblock/BI_NA21987_1_99000142135_SM-GXZVB_1.vcf.gz"],

"ImportGenomes.output_directory": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/testrun1_importdir"
"ImportGenomes.output_directory": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/testrun2_importdir"
}

4 changes: 4 additions & 0 deletions scripts/variantstore/wdl/ImportGenomes.wdl
@@ -14,6 +14,7 @@ workflow ImportGenomes {
File vet_schema
File metadata_schema
String? drop_state
Boolean? drop_state_includes_greater_than = false

Int? preemptible_tries
File? gatk_override
@@ -39,6 +40,7 @@
input_metrics = input_metric,
sample_map = sample_map,
drop_state = drop_state,
drop_state_includes_greater_than = drop_state_includes_greater_than,
output_directory = output_directory,
gatk_override = gatk_override,
docker = docker_final,
@@ -124,6 +126,7 @@ task CreateImportTsvs {
String output_directory
File sample_map
String? drop_state
Boolean? drop_state_includes_greater_than = false

# runtime
Int? preemptible_tries
@@ -157,6 +160,7 @@
-V ~{input_vcf} \
-L ~{interval_list} \
~{"-IG " + drop_state} \
--ignore-above-gq-threshold ~{drop_state_includes_greater_than} \
~{"-QCF " + input_metrics} \
--mode GENOMES \
-SNM ~{sample_map} \
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/ngs_cohort_extract.inputs.json
@@ -5,7 +5,7 @@
"NgsCohortExtract.wgs_intervals": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
"NgsCohortExtract.scatter_count": 50,

"NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/scratch/mshand/JointGenotyping/jars/gatk_filtering_for_export.jar",
"NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"NgsCohortExtract.fq_sample_table": "spec-ops-aou.kc_high_cov_ccdg.cohort_100_of_194",
"NgsCohortExtract.fq_cohort_extract_table": "spec-ops-aou.kc_high_cov_ccdg.exported_cohort_100_test",
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/ngs_filter_extract.inputs.json
@@ -27,7 +27,7 @@
"NgsFilterExtract.large_disk": 1000,
"NgsFilterExtract.huge_disk": 2000,

"NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-145-gefaac86-SNAPSHOT-local.jar",
"NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"NgsFilterExtract.fq_sample_table": "spec-ops-aou.kc_acmg.metadata",
"NgsFilterExtract.fq_alt_allele_table": "spec-ops-aou.kc_acmg.alt_allele",
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/ngs_filter_extract.wdl
@@ -448,7 +448,7 @@ task SNPsVariantRecalibrator {
dbsnp_resource_vcf],
"GiB"))
Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem])
Int java_mem = machine_mem - 1
Int java_mem = machine_mem - 3


String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else ""
@@ -458,7 +458,7 @@

MODEL_REPORT=~{model_report}

gatk --java-options -Xms~{java_mem}g \
gatk --java-options -Xmx~{java_mem}g \
Contributor comment (on the -Xmx change above): can you add a comment here on why we are setting the max?

VariantRecalibrator \
-V ~{sites_only_variant_filtered_vcf} \
-O ~{recalibration_filename} \
@@ -480,7 +480,7 @@
memory: "~{machine_mem} GiB"
cpu: 2
disks: "local-disk " + disk_size + " HDD"
preemptible: 1
preemptible: 0
docker: gatk_docker
}

@@ -172,6 +172,7 @@ public enum OutputType {
TSV,
ORC,
AVRO,
PARQUET
PARQUET,
TSV2
}
}
@@ -62,6 +62,12 @@ public final class CreateVariantIngestFiles extends VariantWalker {
optional = true)
public PetTsvCreator.GQStateEnum gqStateToIgnore = null;

@Argument(fullName = "ignore-above-gq-threshold",
shortName = "GTIG",
doc = "in addition to dropping the gq block specified by ref-block-gq-to-ignore, also drop higher gq blocks",
optional = true)
public boolean dropAboveGqThreshold = false;

@Argument(fullName = "sample-name-mapping",
shortName = "SNM",
doc = "Sample name to sample id mapping",
@@ -145,7 +151,7 @@ public void onTraversalStart() {
final GenomeLocSortedSet genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(), IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));

petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, outputDir, outputType);
petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, dropAboveGqThreshold, outputDir, outputType);
switch (mode) {
case EXOMES:
case GENOMES:
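Taken together with the PetTsvCreator change below, the new argument is meant to compose with ref-block-gq-to-ignore as follows. This is a minimal stand-alone sketch, not code from this PR; the import path is assumed from the package of PetTsvWriter.java at the bottom of this diff, and GQStateEnum is assumed to be the public nested enum referenced above.

import java.util.HashSet;
import java.util.Set;

import org.broadinstitute.hellbender.tools.variantdb.nextgen.PetTsvCreator;
import org.broadinstitute.hellbender.tools.variantdb.nextgen.PetTsvCreator.GQStateEnum;

public class DropStateExample {
    public static void main(String[] args) {
        // Equivalent of running CreateVariantIngestFiles with "-IG FORTY --ignore-above-gq-threshold true":
        // the FORTY band and every higher band end up in the set of reference blocks to drop.
        Set<GQStateEnum> ignored = new HashSet<>();
        ignored.add(GQStateEnum.FORTY);
        ignored.addAll(PetTsvCreator.getGQStateEnumGreaterThan(GQStateEnum.FORTY));
        System.out.println(ignored); // FORTY, FIFTY, SIXTY
    }
}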
@@ -15,7 +15,9 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public final class PetTsvCreator {
@@ -24,21 +26,21 @@ public final class PetTsvCreator {
private CommonCode.OutputType outputType;

private SimpleXSVWriter petTsvWriter = null;
private PetTsvWriter petTsv2Writer = null;
private PetOrcWriter petOrcWriter = null;
private PetAvroWriter petAvroWriter = null;
private PetParquetWriter petParquetWriter = null;

private final String sampleId;
private SimpleInterval previousInterval;
private final SAMSequenceDictionary seqDictionary;
private final GQStateEnum gqStateToIgnore;
private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<GQStateEnum>();
private GenomeLocSortedSet coverageLocSortedSet;
private static String PET_FILETYPE_PREFIX = "pet_";

public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final File outputDirectory, final CommonCode.OutputType outputType) {
public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final boolean dropAboveGqThreshold, final File outputDirectory, final CommonCode.OutputType outputType) {
this.sampleId = sampleId;
this.seqDictionary = seqDictionary;
this.gqStateToIgnore = gqStateToIgnore;
this.outputType = outputType;

coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
@@ -51,6 +53,9 @@ public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefi
petTsvWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
petTsvWriter.setHeaderLine(petHeader);
break;
case TSV2:
petTsv2Writer = new PetTsvWriter(petOutputFile.getCanonicalPath());
break;
case ORC:
petOrcWriter = new PetOrcWriter(petOutputFile.getCanonicalPath());
break;
@@ -64,6 +69,10 @@ public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefi
throw new UserException("Could not create pet outputs", e);
}

this.gqStatesToIgnore.add(gqStateToIgnore);
if (dropAboveGqThreshold) {
this.gqStatesToIgnore.addAll(getGQStateEnumGreaterThan(gqStateToIgnore));
}
}

/**
@@ -112,13 +121,13 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
// TODO throw an error if start and end are the same?

// for each of the reference blocks with the GQ to discard, keep track of the positions for the missing insertions
if (PetTsvCreator.getGQStateEnum(variant.getGenotype(0).getGQ()).equals(gqStateToIgnore)) {
if (this.gqStatesToIgnore.contains(getGQStateEnum(variant.getGenotype(0).getGQ()))) {
// add interval to "covered" intervals
setCoveredInterval(variantChr, start, end);
}

// create PET output if the reference block's GQ is not one of the states to discard or it's a variant
if (!variant.isReferenceBlock() || !PetTsvCreator.getGQStateEnum(variant.getGenotype(0).getGQ()).equals(gqStateToIgnore)) {
if (!variant.isReferenceBlock() || !this.gqStatesToIgnore.contains(PetTsvCreator.getGQStateEnum(variant.getGenotype(0).getGQ()))) {

// add interval to "covered" intervals
setCoveredInterval(variantChr, start, end);
@@ -151,6 +160,9 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
case TSV:
petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
break;
case TSV2:
petTsv2Writer.addRow(location, sampleId, state);
break;
case ORC:
petOrcWriter.addRow(location, sampleId, state);
break;
@@ -308,6 +320,50 @@ public static GQStateEnum getGQStateEnum(int GQ){

}

// this is ugly.... I think we need to rework the enum to better handle the new use cases
// but just getting this going.
public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){
Set<GQStateEnum> ret = new HashSet<GQStateEnum>();

switch (s) {
case ZERO:
ret.add(GQStateEnum.TEN);
ret.add(GQStateEnum.TWENTY);
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case TEN:
ret.add(GQStateEnum.TWENTY);
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case TWENTY:
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case THIRTY:
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case FORTY:
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case FIFTY:
ret.add(GQStateEnum.SIXTY);
break;
}

return ret;
}

public static List<String> getHeaders() {
return Arrays.stream(PetFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
}
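The cascading switch above is flagged by its own comment as something to rework. One possible direction, shown here as a stand-alone sketch rather than a drop-in replacement, is to derive the "greater than" set from declaration order; this assumes the numeric GQ states are declared in ascending GQ order, which has not been verified against the real GQStateEnum.

import java.util.EnumSet;
import java.util.Set;

public class GqThresholdSketch {
    // Stand-in for the numeric states of PetTsvCreator.GQStateEnum,
    // assumed to be declared in ascending GQ order.
    enum GQState { ZERO, TEN, TWENTY, THIRTY, FORTY, FIFTY, SIXTY }

    // All states whose GQ band is strictly above the given one.
    static Set<GQState> statesAbove(GQState s) {
        EnumSet<GQState> above = EnumSet.noneOf(GQState.class);
        for (GQState candidate : GQState.values()) {
            if (candidate.ordinal() > s.ordinal()) {
                above.add(candidate);
            }
        }
        return above;
    }

    public static void main(String[] args) {
        System.out.println(statesAbove(GQState.FORTY)); // [FIFTY, SIXTY], matching the switch above
    }
}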
@@ -0,0 +1,37 @@
package org.broadinstitute.hellbender.tools.variantdb.nextgen;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;

public class PetTsvWriter implements Closeable {
private BufferedWriter writer;
private final static char SEPARATOR = IngestConstants.SEPARATOR;

public PetTsvWriter(String outputFile) throws IOException{
writer = Files.newBufferedWriter(Paths.get(outputFile));

writer.append("location");
writer.append(SEPARATOR);
writer.append("sample");
writer.append(SEPARATOR);
writer.append("state");
}

public void addRow(long location, long sampleId, String state) throws IOException {
writer.append(String.valueOf(location));
writer.append(SEPARATOR);
writer.append(String.valueOf(sampleId));
writer.append(SEPARATOR);
writer.append(state);
// one entry per line
writer.newLine();
}

public void close() throws IOException {
writer.flush();
writer.close();
}
}
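For reference, a minimal usage sketch of the new writer; the location, sample id, and state values below are placeholders, since in the tool they are produced inside PetTsvCreator.

import org.broadinstitute.hellbender.tools.variantdb.nextgen.PetTsvWriter;

public class PetTsvWriterExample {
    public static void main(String[] args) throws Exception {
        // PetTsvWriter implements Closeable, so try-with-resources flushes and closes it.
        try (PetTsvWriter writer = new PetTsvWriter("pet_example.tsv")) {
            writer.addRow(1000000123456L, 7L, "v");  // placeholder encoded location, sample id, state
            writer.addRow(1000000123457L, 7L, "40");
        }
    }
}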