Skip to content

Commit

Permalink
optimized TSV experiment, and range GQ dropping (#6987)
Browse files Browse the repository at this point in the history
* optimized TSV experiment, and range GQ dropping
* typos
* PR comment
  • Loading branch information
kcibul committed Mar 9, 2021
1 parent d658b25 commit d6d5b08
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 16 deletions.
9 changes: 5 additions & 4 deletions scripts/variantstore/wdl/ImportGenomes.example.inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
"ImportGenomes.vet_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/vet_schema.relaxed.json",
"ImportGenomes.metadata_schema": "gs://broad-dsp-spec-ops/scratch/andrea/schemas/sample_list_schema.json",
"ImportGenomes.interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
"ImportGenomes.drop_state": "SIXTY",
"ImportGenomes.gatk_override": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/gatk-package-4.1.8.1-149-g6b93ec6-SNAPSHOT-local.jar",
"ImportGenomes.drop_state": "FORTY",
"ImportGenomes.drop_state_includes_greater_than": "true",
"ImportGenomes.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"ImportGenomes.project_id": "spec-ops-aou",
"ImportGenomes.dataset_name": "kc_import_test5",
"ImportGenomes.dataset_name": "kc_import_test1",

"ImportGenomes.sample_map": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/test_sample_map.csv",

"ImportGenomes.input_vcfs": ["gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/9a5be44d-c032-40ef-bd4f-8de736299484/ReblockGVCF/65c5b1d0-4aeb-44cc-a696-710dbc5f60f1/call-Reblock/BI_NA14170_A11_99000142090_SM-GXZV3_1.vcf.gz",
"gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/9a5be44d-c032-40ef-bd4f-8de736299484/ReblockGVCF/f02d7f4e-a19b-4d84-b0de-0750c9b3cfd9/call-Reblock/BI_NA23079_1_99000142156_SM-GXZVL_1.vcf.gz",
"gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/9a5be44d-c032-40ef-bd4f-8de736299484/ReblockGVCF/dc63b7c7-9d13-4674-8680-072ee04e9237/call-Reblock/BI_NA21987_1_99000142135_SM-GXZVB_1.vcf.gz"],

"ImportGenomes.output_directory": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/testrun1_importdir"
"ImportGenomes.output_directory": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/testrun2_importdir"
}

4 changes: 4 additions & 0 deletions scripts/variantstore/wdl/ImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ workflow ImportGenomes {
File vet_schema
File metadata_schema
String? drop_state
Boolean? drop_state_includes_greater_than = false
Int? preemptible_tries
File? gatk_override
Expand All @@ -39,6 +40,7 @@ workflow ImportGenomes {
input_metrics = input_metric,
sample_map = sample_map,
drop_state = drop_state,
drop_state_includes_greater_than = drop_state_includes_greater_than,
output_directory = output_directory,
gatk_override = gatk_override,
docker = docker_final,
Expand Down Expand Up @@ -124,6 +126,7 @@ task CreateImportTsvs {
String output_directory
File sample_map
String? drop_state
Boolean? drop_state_includes_greater_than = false

# runtime
Int? preemptible_tries
Expand Down Expand Up @@ -157,6 +160,7 @@ task CreateImportTsvs {
-V ~{input_vcf} \
-L ~{interval_list} \
~{"-IG " + drop_state} \
--ignore-above-gq-threshold ~{drop_state_includes_greater_than} \
~{"-QCF " + input_metrics} \
--mode GENOMES \
-SNM ~{sample_map} \
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/ngs_cohort_extract.inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"NgsCohortExtract.wgs_intervals": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
"NgsCohortExtract.scatter_count": 50,

"NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/scratch/mshand/JointGenotyping/jars/gatk_filtering_for_export.jar",
"NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"NgsCohortExtract.fq_sample_table": "spec-ops-aou.kc_high_cov_ccdg.cohort_100_of_194",
"NgsCohortExtract.fq_cohort_extract_table": "spec-ops-aou.kc_high_cov_ccdg.exported_cohort_100_test",
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/ngs_filter_extract.inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"NgsFilterExtract.large_disk": 1000,
"NgsFilterExtract.huge_disk": 2000,

"NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-145-gefaac86-SNAPSHOT-local.jar",
"NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"NgsFilterExtract.fq_sample_table": "spec-ops-aou.kc_acmg.metadata",
"NgsFilterExtract.fq_alt_allele_table": "spec-ops-aou.kc_acmg.alt_allele",
Expand Down
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/ngs_filter_extract.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ task SNPsVariantRecalibrator {
dbsnp_resource_vcf],
"GiB"))
Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem])
Int java_mem = machine_mem - 1
Int java_mem = machine_mem - 3


String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else ""
Expand All @@ -458,7 +458,7 @@ task SNPsVariantRecalibrator {

MODEL_REPORT=~{model_report}

gatk --java-options -Xms~{java_mem}g \
gatk --java-options -Xmx~{java_mem}g \
VariantRecalibrator \
-V ~{sites_only_variant_filtered_vcf} \
-O ~{recalibration_filename} \
Expand All @@ -480,7 +480,7 @@ task SNPsVariantRecalibrator {
memory: "~{machine_mem} GiB"
cpu: 2
disks: "local-disk " + disk_size + " HDD"
preemptible: 1
preemptible: 0
docker: gatk_docker
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ public enum OutputType {
TSV,
ORC,
AVRO,
PARQUET
PARQUET,
TSV2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ public final class CreateVariantIngestFiles extends VariantWalker {
optional = true)
public PetTsvCreator.GQStateEnum gqStateToIgnore = null;

@Argument(fullName = "ignore-above-gq-threshold",
shortName = "GTIG",
doc = "in addition to dropping the gq block specified by ref-block-gq-to-ignore, also drop higher gq blocks",
optional = true)
public boolean dropAboveGqThreshold = false;

@Argument(fullName = "sample-name-mapping",
shortName = "SNM",
doc = "Sample name to sample id mapping",
Expand Down Expand Up @@ -145,7 +151,7 @@ public void onTraversalStart() {
final GenomeLocSortedSet genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(), IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));

petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, outputDir, outputType);
petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, dropAboveGqThreshold, outputDir, outputType);
switch (mode) {
case EXOMES:
case GENOMES:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public final class PetTsvCreator {
Expand All @@ -24,21 +26,21 @@ public final class PetTsvCreator {
private CommonCode.OutputType outputType;

private SimpleXSVWriter petTsvWriter = null;
private PetTsvWriter petTsv2Writer = null;
private PetOrcWriter petOrcWriter = null;
private PetAvroWriter petAvroWriter = null;
private PetParquetWriter petParquetWriter = null;

private final String sampleId;
private SimpleInterval previousInterval;
private final SAMSequenceDictionary seqDictionary;
private final GQStateEnum gqStateToIgnore;
private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<GQStateEnum>();
private GenomeLocSortedSet coverageLocSortedSet;
private static String PET_FILETYPE_PREFIX = "pet_";

public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final File outputDirectory, final CommonCode.OutputType outputType) {
public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final boolean dropAboveGqThreshold, final File outputDirectory, final CommonCode.OutputType outputType) {
this.sampleId = sampleId;
this.seqDictionary = seqDictionary;
this.gqStateToIgnore = gqStateToIgnore;
this.outputType = outputType;

coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
Expand All @@ -51,6 +53,9 @@ public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefi
petTsvWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
petTsvWriter.setHeaderLine(petHeader);
break;
case TSV2:
petTsv2Writer = new PetTsvWriter(petOutputFile.getCanonicalPath());
break;
case ORC:
petOrcWriter = new PetOrcWriter(petOutputFile.getCanonicalPath());
break;
Expand All @@ -64,6 +69,10 @@ public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefi
throw new UserException("Could not create pet outputs", e);
}

this.gqStatesToIgnore.add(gqStateToIgnore);
if (dropAboveGqThreshold) {
this.gqStatesToIgnore.addAll(getGQStateEnumGreaterThan(gqStateToIgnore));
}
}

/**
Expand Down Expand Up @@ -112,13 +121,13 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
// TODO throw an error if start and end are the same?

// for each of the reference blocks with the GQ to discard, keep track of the positions for the missing insertions
if (PetTsvCreator.getGQStateEnum(variant.getGenotype(0).getGQ()).equals(gqStateToIgnore)) {
if (this.gqStatesToIgnore.contains(getGQStateEnum(variant.getGenotype(0).getGQ()))) {
// add interval to "covered" intervals
setCoveredInterval(variantChr, start, end);
}

// create PET output if the reference block's GQ is not the one to discard or its a variant
if (!variant.isReferenceBlock() || !PetTsvCreator.getGQStateEnum(variant.getGenotype(0).getGQ()).equals(gqStateToIgnore)) {
if (!variant.isReferenceBlock() || !this.gqStatesToIgnore.contains(PetTsvCreator.getGQStateEnum(variant.getGenotype(0).getGQ()))) {

// add interval to "covered" intervals
setCoveredInterval(variantChr, start, end);
Expand Down Expand Up @@ -151,6 +160,9 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
case TSV:
petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
break;
case TSV2:
petTsv2Writer.addRow(location, sampleId, state);
break;
case ORC:
petOrcWriter.addRow(location, sampleId, state);
break;
Expand Down Expand Up @@ -308,6 +320,50 @@ public static GQStateEnum getGQStateEnum(int GQ){

}

// this is ugly.... I think we need to rework the enum to better handle the new use cases
// but just getting this going.
public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){
Set<GQStateEnum> ret = new HashSet<GQStateEnum>();

switch (s) {
case ZERO:
ret.add(GQStateEnum.TEN);
ret.add(GQStateEnum.TWENTY);
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case TEN:
ret.add(GQStateEnum.TWENTY);
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case TWENTY:
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case THIRTY:
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case FORTY:
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case FIFTY:
ret.add(GQStateEnum.SIXTY);
break;
}

return ret;
}

public static List<String> getHeaders() {
return Arrays.stream(PetFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.broadinstitute.hellbender.tools.variantdb.nextgen;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;

public class PetTsvWriter implements Closeable {
private BufferedWriter writer;
private final static char SEPARATOR = IngestConstants.SEPARATOR;

public PetTsvWriter(String outputFile) throws IOException{
writer = Files.newBufferedWriter(Paths.get(outputFile));

writer.append("location");
writer.append(SEPARATOR);
writer.append("sample");
writer.append(SEPARATOR);
writer.append("state");
}

public void addRow(long location, long sampleId, String state) throws IOException {
writer.append(String.valueOf(location));
writer.append(SEPARATOR);
writer.append(String.valueOf(sampleId));
writer.append(SEPARATOR);
writer.append(state);
}

public void close() throws IOException {
writer.flush();
writer.close();
}
}

0 comments on commit d6d5b08

Please sign in to comment.