Skip to content

Commit

Permalink
remove PET code (#7768)
Browse files Browse the repository at this point in the history
  • Loading branch information
kcibul authored Apr 11, 2022
1 parent 6bf03de commit f49e122
Show file tree
Hide file tree
Showing 10 changed files with 32 additions and 3,846 deletions.
14 changes: 2 additions & 12 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ workflow GvsExtractCallset {
fq_ranges_cohort_ref_extract_table = fq_ranges_cohort_ref_extract_table,
fq_ranges_cohort_vet_extract_table = fq_ranges_cohort_vet_extract_table,
read_project_id = query_project,
mode = "RANGES-PREPARED",
do_not_filter_override = false,
fq_ranges_dataset = fq_ranges_dataset,
fq_filter_set_info_table = fq_filter_set_info_table,
Expand Down Expand Up @@ -201,8 +200,6 @@ task ExtractTask {
String output_file
String? output_gcs_dir

String mode

Boolean do_not_filter_override
String fq_ranges_dataset
String fq_filter_set_info_table
Expand Down Expand Up @@ -247,17 +244,10 @@ task ExtractTask {
--filter-set-name ~{filter_set_name}'
fi

if [ ~{mode} = "RANGES-RAW" ]; then
MODE_ARGS="--mode RANGES --vet-ranges-fq-dataset ~{fq_ranges_dataset} "
elif [ ~{mode} = "RANGES-PREPARED" ]; then
MODE_ARGS="--mode RANGES --vet-ranges-extract-fq-table ~{fq_ranges_cohort_vet_extract_table} --ref-ranges-extract-fq-table ~{fq_ranges_cohort_ref_extract_table} "
else
MODE_ARGS="--mode PET --cohort-extract-table ~{fq_cohort_extract_table} "
fi

gatk --java-options "-Xmx9g" \
ExtractCohort \
${MODE_ARGS} \
--vet-ranges-extract-fq-table ~{fq_ranges_cohort_vet_extract_table} \
--ref-ranges-extract-fq-table ~{fq_ranges_cohort_ref_extract_table} \
--ref-version 38 \
-R ~{reference} \
-O ~{output_file} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,6 @@ public static Set<VCFHeaderLine> getEvoquerVcfHeaderLines() {
return headerLines;
}

public enum ModeEnum {
PET,
RANGES
}

public enum OutputType {
TSV,
ORC,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,6 @@ public abstract class ExtractTool extends GATKTool {
)
protected int localSortMaxRecordsInRam = DEFAULT_LOCAL_SORT_MAX_RECORDS_IN_RAM;

@Argument(
fullName = "mode",
doc = "Reference representation mode. Valid values are PET or RANGES",
optional = false
)
protected CommonCode.ModeEnum mode = CommonCode.ModeEnum.PET;

@Argument(
fullName = "ref-version",
doc = "Remove this option!!!! only for ease of testing. Valid options are 37 or 38",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,6 @@ protected void onStartup() {
annotationEngine,
reference,
sampleIdToName,
mode,
cohortTable,
cohortAvroFileName,
vetRangesFQDataSet,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ public class ExtractCohortEngine {

private final boolean printDebugInformation;
private final int localSortMaxRecordsInRam;
private final TableReference cohortTableRef;
private final List<SimpleInterval> traversalIntervals;
private final Long minLocation;
private final Long maxLocation;
Expand All @@ -60,7 +59,6 @@ public class ExtractCohortEngine {

private final ProgressMeter progressMeter;
private final String projectID;
private final CommonCode.ModeEnum mode;
private final boolean emitPLs;

/** List of sample names seen in the variant data from BigQuery. */
Expand Down Expand Up @@ -99,7 +97,6 @@ public ExtractCohortEngine(final String projectID,
final VariantAnnotatorEngine annotationEngine,
final ReferenceDataSource refSource,
final Map<Long, String> sampleIdToName,
final CommonCode.ModeEnum mode,
final String cohortTableName,
final GATKPath cohortAvroFileName,
final String vetRangesFQDataSet,
Expand Down Expand Up @@ -146,12 +143,8 @@ public ExtractCohortEngine(final String projectID,
sampleIdsToExtractBitSet.set(id.intValue());
}

this.mode = mode;
this.emitPLs = emitPLs;

this.cohortTableRef = cohortTableName == null || "".equals(cohortTableName) ? null :
new TableReference(cohortTableName, emitPLs ? SchemaUtils.COHORT_FIELDS : SchemaUtils.COHORT_FIELDS_NO_PL);

this.vetRangesFQDataSet = vetRangesFQDataSet;
this.fqRangesExtractVetTable = fqRangesExtractVetTable;
this.fqRangesExtractRefTable = fqRangesExtractRefTable;
Expand Down Expand Up @@ -252,30 +245,19 @@ public void traverse() {
}
logger.debug("Initializing Reader");

if (this.mode == CommonCode.ModeEnum.RANGES) {
if (!SchemaUtils.decodeContig(minLocation).equals(SchemaUtils.decodeContig(maxLocation))) {
throw new GATKException("Can not process cross-contig boundaries for Ranges implementation");
}
if (!SchemaUtils.decodeContig(minLocation).equals(SchemaUtils.decodeContig(maxLocation))) {
throw new GATKException("Can not process cross-contig boundaries for Ranges implementation");
}

SortedSet<Long> sampleIdsToExtract = new TreeSet<>(this.sampleIdToName.keySet());
if (fqRangesExtractVetTable != null) {
createVariantsFromUnsortedExtractTableBigQueryRanges(fqRangesExtractVetTable, fqRangesExtractRefTable, sampleIdsToExtract, minLocation, maxLocation, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested);
} else if (vetRangesFQDataSet != null) {
createVariantsFromUnsortedBigQueryRanges(vetRangesFQDataSet, sampleIdsToExtract, minLocation, maxLocation, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested);
} else {
createVariantsFromUnsortedAvroRanges(vetAvroFileName, refRangesAvroFileName, sampleIdsToExtract, minLocation, maxLocation, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested, presortedAvroFiles);
}
SortedSet<Long> sampleIdsToExtract = new TreeSet<>(this.sampleIdToName.keySet());
if (fqRangesExtractVetTable != null) {
createVariantsFromUnsortedExtractTableBigQueryRanges(fqRangesExtractVetTable, fqRangesExtractRefTable, sampleIdsToExtract, minLocation, maxLocation, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested);
} else if (vetRangesFQDataSet != null) {
createVariantsFromUnsortedBigQueryRanges(vetRangesFQDataSet, sampleIdsToExtract, minLocation, maxLocation, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested);
} else {
if (cohortTableRef != null) {
try (StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction, projectID)) {
createVariantsFromUnsortedResult(storageAPIAvroReader, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested);
processBytesScanned(storageAPIAvroReader);
}
} else {
final AvroFileReader avroFileReader = new AvroFileReader(cohortAvroFileName);
createVariantsFromUnsortedResult(avroFileReader, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested);
}
createVariantsFromUnsortedAvroRanges(vetAvroFileName, refRangesAvroFileName, sampleIdsToExtract, minLocation, maxLocation, fullVqsLodMap, fullYngMap, siteFilterMap, noVqslodFilteringRequested, presortedAvroFiles);
}

logger.debug("Finished Initializing Reader");

logger.info("Processed " + totalRangeRecords + " range records and rejected " + totalIrrelevantRangeRecords + " irrelevant ones ");
Expand Down Expand Up @@ -367,63 +349,6 @@ private SortingCollection<GenericRecord> addToRefSortingCollection(final Sorting
return sortingCollection;
}


private void createVariantsFromUnsortedResult(final GATKAvroReader avroReader,
final HashMap<Long, HashMap<Allele, HashMap<Allele, Double>>> fullVqsLodMap,
final HashMap<Long, HashMap<Allele, HashMap<Allele, String>>> fullYngMap,
final HashMap<Long, List<String>> siteFilterMap,
final boolean noVqslodFilteringRequested) {

final org.apache.avro.Schema schema = avroReader.getSchema();

SortingCollection<GenericRecord> sortingCollection = getAvroSortingCollection(schema, localSortMaxRecordsInRam);

int recordsProcessed = 0;
long startTime = System.currentTimeMillis();

for ( final GenericRecord queryRow : avroReader ) {

sortingCollection.add(queryRow);
if (recordsProcessed++ % 1000000 == 0) {
long endTime = System.currentTimeMillis();
logger.info("Processed " + recordsProcessed + " from BigQuery Read API in " + (endTime-startTime) + " ms");
startTime = endTime;
}
}

sortingCollection.printTempFileStats();

final Map<Long, ExtractCohortRecord> currentPositionRecords = new HashMap<>(sampleIdToName.size() * 2);

long currentLocation = -1;

// NOTE: if OverlapDetector takes too long, try using RegionChecker from tws_sv_local_assembler
final OverlapDetector<SimpleInterval> intervalsOverlapDetector = OverlapDetector.create(traversalIntervals);

for ( final GenericRecord sortedRow : sortingCollection ) {
final ExtractCohortRecord cohortRow = new ExtractCohortRecord( sortedRow );

if ( intervalsOverlapDetector.overlapsAny(cohortRow) ) {
final long location = cohortRow.getLocation();

if (location != currentLocation && currentLocation != -1) {
++totalNumberOfSites;
processSampleRecordsForLocation(currentLocation, currentPositionRecords.values(), fullVqsLodMap, fullYngMap, noVqslodFilteringRequested, siteFilterMap, VQSLODFilteringType);

currentPositionRecords.clear();
}

currentPositionRecords.merge(cohortRow.getSampleId(), cohortRow, this::mergeSampleRecord);
currentLocation = location;
}
}

if ( ! currentPositionRecords.isEmpty() ) {
++totalNumberOfSites;
processSampleRecordsForLocation(currentLocation, currentPositionRecords.values(), fullVqsLodMap, fullYngMap, noVqslodFilteringRequested, siteFilterMap, VQSLODFilteringType);
}
}

private ExtractCohortRecord mergeSampleRecord(ExtractCohortRecord r1, ExtractCohortRecord r2) {
if (printDebugInformation) {
logger.info("In SampleRecord Merge Logic for " + r1 + " and " + r2);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.broadinstitute.hellbender.tools.gvs.extract;

import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
Expand All @@ -12,35 +11,9 @@

class ExtractCohortTest extends CommandLineProgramTest {
private final String prefix = getToolTestDataDir();
private final String cohortAvroFileName = prefix +"chr20_subset_3_samples.avro";
private final String sampleFile = prefix +"sample_list";

@Test
public void testFinalVCFfromAvro() throws Exception {
// To create the expectedVCF file (of expected output) --create a temp table in BQ with the following query
// and then export it through the BQ GUI as an avro file into GCS.
// SELECT * FROM `spec-ops-aou.anvil_100_for_testing.exported_cohort_all_samples`
// where location < 20000000200000 and location >= 20000000100000
// and (sample_name="HG00405" or sample_name="HG00418" or sample_name="HG00408")
final File expectedVCF = getTestFile("expected_extract.vcf");

// create a temporary file (that will get cleaned up after the test has run) to hold the output data in
final File outputVCF = createTempFile("extract_output", "vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args
.add("mode", "PET")
.add("ref-version", 38)
.add("R", hg38Reference)
.add("O", outputVCF.getAbsolutePath())
.add("local-sort-max-records-in-ram", 10000000)
.add("cohort-avro-file-name", cohortAvroFileName)
.add("sample-file", sampleFile)
.add("emit-pls", true);

runCommandLine(args);
IntegrationTestSpec.assertEqualTextFiles(outputVCF, expectedVCF);
}
private final String quickstart10mbRefRangesAvroFile = prefix + "quickstart_10mb_ref_ranges.avro";
private final String quickstart10mbVetAvroFile = prefix + "quickstart_10mb_vet.avro";
private final String quickstartSampleListFile = prefix + "quickstart.sample.list";

@Test
public void testFinalVCFfromRangesAvro() throws Exception {
Expand All @@ -64,15 +37,14 @@ public void testFinalVCFfromRangesAvro() throws Exception {

final ArgumentsBuilder args = new ArgumentsBuilder();
args
.add("mode", "RANGES")
.add("ref-version", 38)
.add("R", hg38Reference)
.add("O", outputVCF.getAbsolutePath())
.add("local-sort-max-records-in-ram", 10000000)
.add("ref-ranges-avro-file-name", getTestFile("quickstart_10mb_ref_ranges.avro"))
.add("vet-avro-file-name", getTestFile("quickstart_10mb_vet.avro"))
.add("L", "chr20:10000000-20000000")
.add("sample-file", prefix+"quickstart.sample.list");
.add("ref-version", 38)
.add("R", hg38Reference)
.add("O", outputVCF.getAbsolutePath())
.add("local-sort-max-records-in-ram", 10000000)
.add("ref-ranges-avro-file-name", quickstart10mbRefRangesAvroFile)
.add("vet-avro-file-name", quickstart10mbVetAvroFile)
.add("sample-file", quickstartSampleListFile)
.add("L", "chr20:10000000-20000000");

runCommandLine(args);
IntegrationTestSpec.assertEqualTextFiles(outputVCF, expectedVCF);
Expand All @@ -82,13 +54,13 @@ public void testFinalVCFfromRangesAvro() throws Exception {
public void testThrowFilterError() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
args
.add("mode", "PET")
.add("ref-version", 38)
.add("R", hg38Reference)
.add("O", "anything")
.add("local-sort-max-records-in-ram", 10000000)
.add("cohort-avro-file-name", cohortAvroFileName)
.add("sample-file", sampleFile)
.add("ref-ranges-avro-file-name", quickstart10mbRefRangesAvroFile)
.add("vet-avro-file-name", quickstart10mbVetAvroFile)
.add("sample-file", quickstartSampleListFile)
.add("filter-set-info-table", "something")
.add("filter-set-name", "something")
.add("emit-pls", false);
Expand All @@ -99,13 +71,13 @@ public void testThrowFilterError() throws Exception {
public void testNoFilteringThresholdsError() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
args
.add("mode", "PET")
.add("ref-version", 38)
.add("R", hg38Reference)
.add("O", "anything")
.add("local-sort-max-records-in-ram", 10000000)
.add("cohort-avro-file-name", cohortAvroFileName)
.add("sample-file", sampleFile)
.add("ref-ranges-avro-file-name", quickstart10mbRefRangesAvroFile)
.add("vet-avro-file-name", quickstart10mbVetAvroFile)
.add("sample-file", quickstartSampleListFile)
.add("emit-pls", false)
.add("filter-set-info-table", "foo")
.add("vqslod-filter-by-site", true);
Expand All @@ -117,13 +89,13 @@ public void testFakeFilteringError() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
// No filterSetInfoTableName included, so should throw a user error with the performSiteSpecificVQSLODFiltering flag
args
.add("mode", "PET")
.add("ref-version", 38)
.add("R", hg38Reference)
.add("O", "anything")
.add("local-sort-max-records-in-ram", 10000000)
.add("cohort-avro-file-name", cohortAvroFileName)
.add("sample-file", sampleFile)
.add("ref-ranges-avro-file-name", quickstart10mbRefRangesAvroFile)
.add("vet-avro-file-name", quickstart10mbVetAvroFile)
.add("sample-file", quickstartSampleListFile)
.add("emit-pls", false)
.add("filter-set-name", "foo")
.add("vqslod-filter-by-site", true);
Expand Down
Binary file not shown.
Loading

0 comments on commit f49e122

Please sign in to comment.