From 7c7599318151fcccd76640e813909ad8f1871aa2 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Thu, 8 Oct 2020 16:48:59 -0400 Subject: [PATCH] Added regression test and additional filter to prevent NaNs from zero-median intervals that result when median filter is disabled. --- .../CreateReadCountPanelOfNormals.java | 1 + .../denoising/SVDDenoisingUtils.java | 8 +++- ...eadCountPanelOfNormalsIntegrationTest.java | 43 ++++++++++++++++++- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java index 9c7ef423fb2..9950d50b048 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java @@ -168,6 +168,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram @Argument( doc = "Genomic intervals with a median (across samples) of fractional coverage (optionally corrected for GC bias) " + "less than or equal to this percentile are filtered out. " + + "If set to zero, intervals with a zero fractional-coverage median are still filtered out. " + "(This is the first filter applied.)", fullName = MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, minValue = 0., diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java index f55cdf63a5a..aea9047694f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java @@ -204,8 +204,12 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r //filter intervals by fractional median final double[] originalIntervalMedians = MatrixSummaryUtils.getColumnMedians(readCounts); if (minimumIntervalMedianPercentile == 0.) { - logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...", + logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped; only filtering intervals with median (across samples) of zero...", CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME)); + IntStream.range(0, numOriginalIntervals) + .filter(intervalIndex -> originalIntervalMedians[intervalIndex] == 0.) + .forEach(intervalIndex -> filterIntervals[intervalIndex] = true); + logger.info(String.format("After filtering, %d out of %d intervals remain...", countNumberPassingFilter(filterIntervals), numOriginalIntervals)); } else { logger.info(String.format("Filtering intervals with median (across samples) less than or equal to the %.2f percentile...", minimumIntervalMedianPercentile)); //calculate percentile @@ -222,7 +226,7 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r .filter(intervalIndex -> !filterIntervals[intervalIndex]) .forEach(intervalIndex -> IntStream.range(0, numOriginalSamples).filter(sampleIndex -> !filterSamples[sampleIndex]).forEach(sampleIndex -> { final double value = readCounts.getEntry(sampleIndex, intervalIndex); - readCounts.setEntry(sampleIndex, intervalIndex,value / originalIntervalMedians[intervalIndex]); + readCounts.setEntry(sampleIndex, intervalIndex, value / originalIntervalMedians[intervalIndex]); })); //filter samples by percentage of zero-coverage intervals not already filtered diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java index 392c500b461..57d8c462112 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java @@ -252,7 +252,7 @@ public void test(final List inputFiles, public void testSingleSample(final List inputFiles, final File annotatedIntervalsFile, final int expectedNumberOfEigenvalues) { //ignored in this test - final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv"); + final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5"); final ArgumentsBuilder argsBuilder = new ArgumentsBuilder() .add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE)) .add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE)) @@ -267,11 +267,50 @@ public void testSingleSample(final List inputFiles, runCommandLine(argsBuilder); //just check that we can build the panel; no other assertions checked } + /** + * Regression test for https://github.com/broadinstitute/gatk/pull/6624. + * Logic for filtering was changed in this PR; without the change, running this test would erroneously result in + * none of the bad intervals being filtered out (due to a gap in the previous filtering logic that would ultimately + * yield NaNs resulting from dividing by zero interval medians), so we check that at least some intervals are filtered out. + */ + @Test(dataProvider = "dataPanelOfNormals") + public void testSingleSampleWithTooManyZeros(final List inputFiles, + final File annotatedIntervalsFile, + final int expectedNumberOfEigenvalues) { //ignored in this test + final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5"); + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder() + .add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE)) + .add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE)) + .add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE)) + .add(CreateReadCountPanelOfNormals.EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(EXTREME_SAMPLE_MEDIAN_PERCENTILE)) + .add(CopyNumberStandardArgument.NUMBER_OF_EIGENSAMPLES_LONG_NAME, Integer.toString(NUMBER_OF_EIGENVALUES_REQUESTED)) + .addOutput(resultOutputFile); + if (annotatedIntervalsFile != null) { + argsBuilder.add(CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME, annotatedIntervalsFile); + } + argsBuilder.addInput(inputFiles.get(0)); //use only first bad sample + runCommandLine(argsBuilder); + + try (final HDF5File hdf5PanelOfNormalsFile = new HDF5File(resultOutputFile)) { + final SVDReadCountPanelOfNormals panelOfNormals = HDF5SVDReadCountPanelOfNormals.read(hdf5PanelOfNormalsFile); + + //check dimensions of original counts and intervals + final RealMatrix counts = new Array2DRowRealMatrix(panelOfNormals.getOriginalReadCounts()); + Assert.assertEquals(counts.getRowDimension(), 1); + Assert.assertEquals(counts.getColumnDimension(), NUM_INTERVALS); + final List originalIntervals = panelOfNormals.getOriginalIntervals(); + Assert.assertEquals(originalIntervals.size(), NUM_INTERVALS); + + //guard against regression by checking that zero intervals are indeed filtered + Assert.assertTrue(panelOfNormals.getPanelIntervals().size() < NUM_INTERVALS); + } + } + @Test(dataProvider = "dataPanelOfNormals") public void testZeroEigensamples(final List inputFiles, final File annotatedIntervalsFile, final int expectedNumberOfEigenvalues) { //ignored in this test - final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv"); + final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5"); final ArgumentsBuilder argsBuilder = new ArgumentsBuilder() .add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE)) .add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))