Skip to content

Commit

Permalink
Added regression test and additional filter to prevent NaNs from zero-median intervals that result when median filter is disabled.
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelklee committed Oct 8, 2020
1 parent 2a8001d commit 7c75993
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
@Argument(
doc = "Genomic intervals with a median (across samples) of fractional coverage (optionally corrected for GC bias) " +
"less than or equal to this percentile are filtered out. " +
"If set to zero, intervals with a zero fractional-coverage median are still filtered out. " +
"(This is the first filter applied.)",
fullName = MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,12 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
//filter intervals by fractional median
final double[] originalIntervalMedians = MatrixSummaryUtils.getColumnMedians(readCounts);
if (minimumIntervalMedianPercentile == 0.) {
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...",
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped; only filtering intervals with median (across samples) of zero...",
CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME));
IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> originalIntervalMedians[intervalIndex] == 0.)
.forEach(intervalIndex -> filterIntervals[intervalIndex] = true);
logger.info(String.format("After filtering, %d out of %d intervals remain...", countNumberPassingFilter(filterIntervals), numOriginalIntervals));
} else {
logger.info(String.format("Filtering intervals with median (across samples) less than or equal to the %.2f percentile...", minimumIntervalMedianPercentile));
//calculate percentile
Expand All @@ -222,7 +226,7 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
.filter(intervalIndex -> !filterIntervals[intervalIndex])
.forEach(intervalIndex -> IntStream.range(0, numOriginalSamples).filter(sampleIndex -> !filterSamples[sampleIndex]).forEach(sampleIndex -> {
final double value = readCounts.getEntry(sampleIndex, intervalIndex);
readCounts.setEntry(sampleIndex, intervalIndex,value / originalIntervalMedians[intervalIndex]);
readCounts.setEntry(sampleIndex, intervalIndex, value / originalIntervalMedians[intervalIndex]);
}));

//filter samples by percentage of zero-coverage intervals not already filtered
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ public void test(final List<File> inputFiles,
public void testSingleSample(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) { //ignored in this test
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand All @@ -267,11 +267,50 @@ public void testSingleSample(final List<File> inputFiles,
runCommandLine(argsBuilder); //just check that we can build the panel; no other assertions checked
}

/**
 * Regression test for https://github.com/broadinstitute/gatk/pull/6624.
 * That PR changed the interval-filtering logic: previously, disabling the median-percentile
 * filter left zero-median intervals in place, which later produced NaNs when counts were
 * divided by the per-interval medians. Running this test against the old logic would
 * erroneously filter out none of the bad intervals, so here we assert that at least some
 * intervals are removed from the panel.
 */
@Test(dataProvider = "dataPanelOfNormals")
public void testSingleSampleWithTooManyZeros(final List<File> inputFiles,
                                             final File annotatedIntervalsFile,
                                             final int expectedNumberOfEigenvalues) { //ignored in this test
    final File outputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");

    // Build the tool arguments; only the first (bad) sample is used as input.
    final ArgumentsBuilder builder = new ArgumentsBuilder()
            .add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
            .add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
            .add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE))
            .add(CreateReadCountPanelOfNormals.EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(EXTREME_SAMPLE_MEDIAN_PERCENTILE))
            .add(CopyNumberStandardArgument.NUMBER_OF_EIGENSAMPLES_LONG_NAME, Integer.toString(NUMBER_OF_EIGENVALUES_REQUESTED))
            .addOutput(outputFile);
    if (annotatedIntervalsFile != null) {
        builder.add(CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME, annotatedIntervalsFile);
    }
    builder.addInput(inputFiles.get(0)); //use only first bad sample
    runCommandLine(builder);

    try (final HDF5File hdf5File = new HDF5File(outputFile)) {
        final SVDReadCountPanelOfNormals pon = HDF5SVDReadCountPanelOfNormals.read(hdf5File);

        //check dimensions of original counts and intervals
        final RealMatrix originalCounts = new Array2DRowRealMatrix(pon.getOriginalReadCounts());
        Assert.assertEquals(originalCounts.getRowDimension(), 1);
        Assert.assertEquals(originalCounts.getColumnDimension(), NUM_INTERVALS);
        final List<SimpleInterval> intervals = pon.getOriginalIntervals();
        Assert.assertEquals(intervals.size(), NUM_INTERVALS);

        //guard against regression by checking that zero intervals are indeed filtered
        Assert.assertTrue(pon.getPanelIntervals().size() < NUM_INTERVALS);
    }
}

@Test(dataProvider = "dataPanelOfNormals")
public void testZeroEigensamples(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) { //ignored in this test
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand Down

0 comments on commit 7c75993

Please sign in to comment.