diff --git a/src/main/java/org/broadinstitute/hellbender/engine/TwoPassVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/TwoPassVariantWalker.java new file mode 100644 index 00000000000..b0ff611956e --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/engine/TwoPassVariantWalker.java @@ -0,0 +1,92 @@ +package org.broadinstitute.hellbender.engine; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; +import org.broadinstitute.hellbender.engine.filters.VariantFilter; +import org.broadinstitute.hellbender.utils.SimpleInterval; + +import java.util.stream.StreamSupport; + +/** + * A VariantWalker that makes two passes through the variants. + * This allows the user to store internal states during the first pass, which the user can then + * process and access during the second pass + **/ +public abstract class TwoPassVariantWalker extends VariantWalker { + /** + * Overrides the default, single-pass traversal framework of {@link VariantWalkerBase} + */ + @Override + public void traverse(){ + final VariantFilter variantContextFilter = makeVariantFilter(); + final CountingReadFilter readFilter = makeReadFilter(); + + // First pass through the variants + logger.info("Starting first pass through the variants"); + traverseVariants(variantContextFilter, readFilter, this::firstPassApply); + logger.info("Finished first pass through the variants"); + + // Process the data accumulated during the first pass + afterFirstPass(); + + // Second pass + logger.info("Starting second pass through the variants"); + traverseVariants(variantContextFilter, readFilter, this::secondPassApply); + + logger.info(readFilter.getSummaryLine()); + } + + /** + * + * First pass through the variants. The user may store data in instance variables of the walker + * and process them in {@link #afterFirstPass} before making a second pass + * + * @param variant A variant record in a vcf + * @param readsContext Reads overlapping the current variant. Will be empty if a read source (e.g. bam) isn't provided + * @param referenceContext Reference bases spanning the current variant + * @param featureContext A record overlapping the current variant from an auxiliary data source (e.g. gnomad) + */ + protected abstract void firstPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext ); + /** + * Process the data collected during the first pass. This method is called between the two traversals + */ + protected abstract void afterFirstPass(); + + /** + * + * Having seen all of the variants in a vcf, make a second pass through the variants + * + * See argument docs of {@link #firstPassApply} and {@link VariantWalkerBase#apply} + **/ + protected abstract void secondPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext); + + private void traverseVariants(final VariantFilter variantFilter, final CountingReadFilter readFilter, final VariantConsumer variantConsumer){ + StreamSupport.stream(getSpliteratorForDrivingVariants(), false) + .filter(variantFilter) + .forEach(variant -> { + final SimpleInterval variantInterval = new SimpleInterval(variant); + variantConsumer.consume(variant, + new ReadsContext(reads, variantInterval, readFilter), + new ReferenceContext(reference, variantInterval), + new FeatureContext(features, variantInterval)); + progressMeter.update(variantInterval); + }); + } + + @FunctionalInterface + private interface VariantConsumer { + void consume(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext reference, final FeatureContext features); + } + + /** + * Make final to hide it from subclasses + */ + @Override + public final void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {} +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/examples/ExampleTwoPassVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/examples/ExampleTwoPassVariantWalker.java new file mode 100644 index 00000000000..3c7eb85b6c6 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/examples/ExampleTwoPassVariantWalker.java @@ -0,0 +1,95 @@ +package org.broadinstitute.hellbender.tools.examples; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.ExampleProgramGroup; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.TwoPassVariantWalker; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + + +/** + * This walker makes two traversals through variants in a vcf. During the first pass, it collects the qual-by-depth (QD) annotation + * of each variant and stores it in a list. After the first traversal, it computes the vcf-wide average and variance of QD's. + * The walker will then traverse the variants the second time and annotate each variant with the distance of its QD from the mean + * in units of standard deviations. + * + */ +@CommandLineProgramProperties( + summary = "Example variant walker that makes two passes through a vcf", + oneLineSummary = "Example two-pass variant walker", + programGroup = ExampleProgramGroup.class, + omitFromCommandLine = true +) +public class ExampleTwoPassVariantWalker extends TwoPassVariantWalker { + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output vcf", optional = true) + private File outputVcf = null; + + private final List qualByDepths = new ArrayList<>(); + + private VariantContextWriter vcfWriter; + + public static final String QD_DISTANCE_FROM_MEAN = "QD_DIST"; + + public static final String COPY_OF_QD_KEY_NAME = "QD_COPY"; + + private double averageQualByDepth; + + private double sampleVarianceOfQDs; + + private int counter = 0; + + @Override + public void onTraversalStart() { + final VCFHeader inputHeader = getHeaderForVariants(); + inputHeader.addMetaDataLine(new VCFInfoHeaderLine(QD_DISTANCE_FROM_MEAN, 1, VCFHeaderLineType.Float, "distance from the average QD value in the units of standard deviations")); + inputHeader.addMetaDataLine(new VCFInfoHeaderLine(COPY_OF_QD_KEY_NAME, 1, VCFHeaderLineType.Float, "copy of the QD INFO field")); + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(inputHeader); + } + + @Override + protected void firstPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + qualByDepths.add(variant.getAttributeAsDouble(GATKVCFConstants.QUAL_BY_DEPTH_KEY, 0.0)); + } + + @Override + protected void afterFirstPass() { + final int n = qualByDepths.size(); + if (n == 1){ + return; + } + averageQualByDepth = qualByDepths.stream().mapToDouble(x -> x).average().orElse(0.0); + sampleVarianceOfQDs = (1.0/(n - 1.0))* qualByDepths.stream().mapToDouble(x -> Math.pow((x - averageQualByDepth), 2.0)).sum(); + } + + @Override + protected void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + VariantContextBuilder vcb = new VariantContextBuilder(variant); + final double qualByDepth = qualByDepths.get(counter++); + final double distanceFromMean = Math.abs((qualByDepth - averageQualByDepth)/ Math.sqrt(sampleVarianceOfQDs)); + vcb.attribute(QD_DISTANCE_FROM_MEAN, distanceFromMean); + vcb.attribute(COPY_OF_QD_KEY_NAME, qualByDepth); + + vcfWriter.add(vcb.make()); + } + + @Override + public void closeTool() { + if ( vcfWriter != null ) { + vcfWriter.close(); + } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/TwoPassVariantWalkerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/TwoPassVariantWalkerUnitTest.java new file mode 100644 index 00000000000..7672630efb2 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/engine/TwoPassVariantWalkerUnitTest.java @@ -0,0 +1,53 @@ +package org.broadinstitute.hellbender.engine; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.cmdline.TestProgramGroup; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class TwoPassVariantWalkerUnitTest extends GATKBaseTest { + @CommandLineProgramProperties( + summary = "An example subclass of TwoPassVariantWalker", + oneLineSummary = "An example subclass of TwoPassVariantWalker", + programGroup = TestProgramGroup.class, + omitFromCommandLine = true + ) + private static class DummyTwoPassVariantWalker extends TwoPassVariantWalker { + public int firstPass = 0; + public int secondPass = 0; + public boolean visitedAfterFirstPass = false; + + @Override + protected void firstPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + firstPass++; + } + + @Override + protected void afterFirstPass(){ + visitedAfterFirstPass = true; + } + + @Override + protected void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + secondPass++; + } + } + + @Test + public void testTwoPassTraversal() { + final DummyTwoPassVariantWalker walker = new DummyTwoPassVariantWalker(); + final String testVcf = "src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/multiallelic.vcf"; + + final String[] args = { "-V", testVcf }; + + walker.instanceMain(args); + + final int expectedNumberOfVariantContexts = 52; + Assert.assertEquals(walker.firstPass, expectedNumberOfVariantContexts); + Assert.assertEquals(walker.secondPass, expectedNumberOfVariantContexts); + Assert.assertTrue(walker.visitedAfterFirstPass); + } + +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/examples/ExampleTwoPassVariantWalkerIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/examples/ExampleTwoPassVariantWalkerIntegrationTest.java new file mode 100644 index 00000000000..719ca05be81 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/examples/ExampleTwoPassVariantWalkerIntegrationTest.java @@ -0,0 +1,46 @@ +package org.broadinstitute.hellbender.tools.examples; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Iterator; + +public class ExampleTwoPassVariantWalkerIntegrationTest extends CommandLineProgramTest { + @Test + public void test() { + final File outputVcf = BaseTest.createTempFile("output", "vcf"); + final String inputVcf = "src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/haploid-multisample.vcf"; + + final String[] args = { + "-" + StandardArgumentDefinitions.VARIANT_SHORT_NAME, inputVcf, + "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputVcf.getAbsolutePath() + }; + + runCommandLine(args); + + try (final FeatureDataSource variantsBefore = new FeatureDataSource<>(inputVcf); + final FeatureDataSource variantsAfter = new FeatureDataSource<>(outputVcf)){ + final Iterator beforeIterator = variantsBefore.iterator(); + final Iterator afterIterator = variantsAfter.iterator(); + + while (beforeIterator.hasNext() && afterIterator.hasNext()){ + final VariantContext before = beforeIterator.next(); + final VariantContext after = afterIterator.next(); + + Assert.assertEquals(before.getAttributeAsDouble(GATKVCFConstants.QUAL_BY_DEPTH_KEY, -1.0), + after.getAttributeAsDouble(ExampleTwoPassVariantWalker.COPY_OF_QD_KEY_NAME, -2.0)); + Assert.assertTrue(after.getAttributeAsDouble(ExampleTwoPassVariantWalker.QD_DISTANCE_FROM_MEAN, -1.0) > 0); + } + + Assert.assertTrue(! afterIterator.hasNext() && ! beforeIterator.hasNext()); + } + } + +} \ No newline at end of file