Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjusted logic for filtering zero-coverage samples and intervals in CreateReadCountPanelOfNormals. #6624

Merged
merged 3 commits into from
Nov 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
private double minimumIntervalMedianPercentile = DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE;

@Argument(
doc = "Samples with a fraction of zero-coverage genomic intervals above this percentage are filtered out. " +
doc = "Samples with a fraction of zero-coverage genomic intervals greater than or equal to this percentage are filtered out. " +
"(This is the second filter applied.)",
fullName = MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME,
minValue = 0.,
Expand All @@ -187,7 +187,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
private double maximumZerosInSamplePercentage = DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE;

@Argument(
doc = "Genomic intervals with a fraction of zero-coverage samples above this percentage are filtered out. " +
doc = "Genomic intervals with a fraction of zero-coverage samples greater than or equal to this percentage are filtered out. " +
"(This is the third filter applied.)",
fullName = MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME,
minValue = 0.,
Expand All @@ -198,7 +198,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram

@Argument(
doc = "Samples with a median (across genomic intervals) of fractional coverage normalized by genomic-interval medians " +
"below this percentile or above the complementary percentile are filtered out. " +
"strictly below this percentile or strictly above the complementary percentile are filtered out. " +
"(This is the fourth filter applied.)",
fullName = EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand All @@ -217,7 +217,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram

@Argument(
doc = "Fractional coverages normalized by genomic-interval medians that are " +
"below this percentile or above the complementary percentile are set to the corresponding percentile value. " +
"strictly below this percentile or strictly above the complementary percentile are set to the corresponding percentile value. " +
"(This is applied after all filters and imputation.)",
fullName = EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,10 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME));
} else {
logger.info(String.format("Filtering intervals with median (across samples) less than or equal to the %.2f percentile...", minimumIntervalMedianPercentile));
//calculate percentile
final double minimumIntervalMedianThreshold = new Percentile(minimumIntervalMedianPercentile).evaluate(originalIntervalMedians);
logger.info(String.format("Filtering intervals with median (across samples) less than or equal to the %.2f percentile (%.2f)...",
minimumIntervalMedianPercentile, minimumIntervalMedianThreshold));
//filter intervals
IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> originalIntervalMedians[intervalIndex] <= minimumIntervalMedianThreshold)
Expand All @@ -222,23 +223,23 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
.filter(intervalIndex -> !filterIntervals[intervalIndex])
.forEach(intervalIndex -> IntStream.range(0, numOriginalSamples).filter(sampleIndex -> !filterSamples[sampleIndex]).forEach(sampleIndex -> {
final double value = readCounts.getEntry(sampleIndex, intervalIndex);
readCounts.setEntry(sampleIndex, intervalIndex,value / originalIntervalMedians[intervalIndex]);
readCounts.setEntry(sampleIndex, intervalIndex, value / originalIntervalMedians[intervalIndex]); //TODO check effect of NaNs here: https://github.com/broadinstitute/gatk/issues/6878
}));

//filter samples by percentage of zero-coverage intervals not already filtered
if (maximumZerosInSamplePercentage == 100.) {
logger.info(String.format("A value of 100 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME));
} else {
logger.info(String.format("Filtering samples with a fraction of zero-coverage intervals above %.2f percent...", maximumZerosInSamplePercentage));
final int maxZerosInSample = calculateMaximumZerosCount(countNumberPassingFilter(filterIntervals), maximumZerosInSamplePercentage);
logger.info(String.format("Filtering samples with a fraction of zero-coverage intervals greater than or equal to %.2f percent...", maximumZerosInSamplePercentage));
final int numPassingIntervals = countNumberPassingFilter(filterIntervals);
IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> !filterSamples[sampleIndex])
.forEach(sampleIndex -> {
final int numZerosInSample = (int) IntStream.range(0, numOriginalIntervals)
final double numZerosInSample = (double) IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> !filterIntervals[intervalIndex] && readCounts.getEntry(sampleIndex, intervalIndex) == 0.)
.count();
if (numZerosInSample > maxZerosInSample) {
if (numZerosInSample / numPassingIntervals >= maximumZerosInSamplePercentage / 100.) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little confused, why is numZerosInSample a double rather than an int?
If you need it to be a double so that the fraction is a double, why not cast at the point of computing the fraction?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In both cases, the cast happens in the next line and the variable is not used elsewhere, so I'm OK keeping it like this. Certainly it's valid to represent an integer with a double, at least here...?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@samuelklee It's fine the way it is.

filterSamples[sampleIndex] = true;
}
});
Expand All @@ -250,15 +251,15 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 100 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME));
} else {
logger.info(String.format("Filtering intervals with a fraction of zero-coverage samples above %.2f percent...", maximumZerosInIntervalPercentage));
final int maxZerosInInterval = calculateMaximumZerosCount(countNumberPassingFilter(filterSamples), maximumZerosInIntervalPercentage);
logger.info(String.format("Filtering intervals with a fraction of zero-coverage samples greater than or equal to %.2f percent...", maximumZerosInIntervalPercentage));
final int numPassingSamples = countNumberPassingFilter(filterSamples);
IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> !filterIntervals[intervalIndex])
.forEach(intervalIndex -> {
final int numZerosInInterval = (int) IntStream.range(0, numOriginalSamples)
final double numZerosInInterval = (double) IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> !filterSamples[sampleIndex] && readCounts.getEntry(sampleIndex, intervalIndex) == 0.)
.count();
if (numZerosInInterval > maxZerosInInterval) {
if (numZerosInInterval / numPassingSamples >= maximumZerosInIntervalPercentage / 100.) {
filterIntervals[intervalIndex] = true;
}
});
Expand All @@ -270,8 +271,6 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME));
} else {
logger.info(String.format("Filtering samples with a median (across intervals) below the %.2f percentile or above the %.2f percentile...",
extremeSampleMedianPercentile, 100. - extremeSampleMedianPercentile));
//calculate the medians for all samples (which, although unnecessary, makes bookkeeping easier) across intervals not already filtered
final double[] sampleMedians = IntStream.range(0, numOriginalSamples)
.mapToDouble(sampleIndex -> new Median().evaluate(IntStream.range(0, numOriginalIntervals)
Expand All @@ -282,6 +281,8 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
//calculate percentiles
final double minimumSampleMedianThreshold = new Percentile(extremeSampleMedianPercentile).evaluate(sampleMedians);
final double maximumSampleMedianThreshold = new Percentile(100. - extremeSampleMedianPercentile).evaluate(sampleMedians);
logger.info(String.format("Filtering samples with a median (across intervals) strictly below the %.2f percentile (%.2f) or strictly above the %.2f percentile (%.2f)...",
extremeSampleMedianPercentile, minimumSampleMedianThreshold, 100. - extremeSampleMedianPercentile, maximumSampleMedianThreshold));
//filter samples
IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> sampleMedians[sampleIndex] < minimumSampleMedianThreshold || sampleMedians[sampleIndex] > maximumSampleMedianThreshold)
Expand Down Expand Up @@ -352,8 +353,8 @@ public double visit(int sampleIndex, int intervalIndex, double value) {
return value;
}
});
logger.info(String.format("%d values below the %.2f percentile or above the %.2f percentile were truncated to the corresponding value...",
numTruncated[0], extremeOutlierTruncationPercentile, 100. - extremeOutlierTruncationPercentile));
logger.info(String.format("%d values strictly below the %.2f percentile (%.2f) or strictly above the %.2f percentile (%.2f) were truncated to the corresponding value...",
numTruncated[0], extremeOutlierTruncationPercentile, minimumOutlierTruncationThreshold, 100. - extremeOutlierTruncationPercentile, maximumOutlierTruncationThreshold));
}
return new PreprocessedStandardizedResult(
preprocessedReadCounts, panelIntervalFractionalMedians, filterSamples, filterIntervals);
Expand Down Expand Up @@ -492,11 +493,6 @@ public double visit(int sampleIndex, int intervalIndex, double value) {
});
}

private static int calculateMaximumZerosCount(final int numTotalCounts,
final double percentage) {
return (int) Math.ceil(numTotalCounts * percentage / 100.0);
}

private static double safeLog2(final double x) {
return x < EPSILON ? LN2_EPSILON : Math.log(x) * MathUtils.INV_LOG_2;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.DoubleStream;
Expand Down Expand Up @@ -62,8 +63,12 @@ public final class CreateReadCountPanelOfNormalsIntegrationTest extends CommandL

//we test only for filtering of samples and intervals with too many zeros
private static final double MINIMUM_INTERVAL_MEDIAN_PERCENTILE = 0.;
private static final double MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 5.;
private static final double MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 5.;
//test filtering of 5 bad samples
private static final int NUM_ZEROS_IN_BAD_SAMPLE_FOR_SIMULATION = 20;
private static final double MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 19.5; //chosen to guard against regression of an equality check fixed in https://github.com/broadinstitute/gatk/pull/6624
//test filtering of 5 bad intervals (applied after sample filter)
private static final int NUM_ADDITIONAL_ZEROS_IN_BAD_INTERVAL_FOR_SIMULATION = 15; //these zeros are added only in remaining good, unfiltered samples
private static final double MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 14.5; //chosen to guard against regression of an equality check fixed in https://github.com/broadinstitute/gatk/pull/6624
private static final double EXTREME_SAMPLE_MEDIAN_PERCENTILE = 0.;

//test that number of eigenvalues is recovered for a few different values using fraction of variance as a heuristic
Expand Down Expand Up @@ -163,24 +168,24 @@ public double visit(int row, int column, double value) {
}
});

//corrupt first NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS samples by randomly adding zeros
//to 5 * MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE / 100. of intervals
//corrupt first NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS samples by adding zeros
//to NUM_ZEROS_IN_BAD_SAMPLE_FOR_SIMULATION randomly chosen good intervals
for (int sampleIndex = 0; sampleIndex < NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS; sampleIndex++) {
for (int intervalIndex = 0; intervalIndex < NUM_INTERVALS; intervalIndex++) {
if (rng.nextUniform(0., 1.) < 5 * MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE / 100.) {
counts.setEntry(sampleIndex, intervalIndex, 0.);
}
}
final List<Integer> intervalIndicesToZero = IntStream.range(NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS, NUM_INTERVALS).boxed().collect(Collectors.toList());
Collections.shuffle(intervalIndicesToZero, new Random(sampleIndex));
final int si = sampleIndex;
intervalIndicesToZero.subList(0, NUM_ZEROS_IN_BAD_SAMPLE_FOR_SIMULATION)
.forEach(intervalIndex -> counts.setEntry(si, intervalIndex, 0.));
}

//corrupt first NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS intervals by randomly adding zeros
//to 5 * MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE / 100. of samples
//corrupt first NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS intervals by adding zeros
//to NUM_ADDITIONAL_ZEROS_IN_BAD_INTERVAL_FOR_SIMULATION samples randomly chosen from the remaining good samples
for (int intervalIndex = 0; intervalIndex < NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS; intervalIndex++) {
for (int sampleIndex = 0; sampleIndex < NUM_SAMPLES; sampleIndex++) {
if (rng.nextUniform(0., 1.) < 5 * MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE / 100.) {
counts.setEntry(sampleIndex, intervalIndex, 0.);
}
}
final List<Integer> sampleIndicesToZero = IntStream.range(NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS, NUM_SAMPLES).boxed().collect(Collectors.toList()); //choose only from good samples
Collections.shuffle(sampleIndicesToZero, new Random(intervalIndex));
final int ii = intervalIndex;
sampleIndicesToZero.subList(0, NUM_ADDITIONAL_ZEROS_IN_BAD_INTERVAL_FOR_SIMULATION)
.forEach(sampleIndex -> counts.setEntry(sampleIndex, ii, 0.));
}

//make input files from counts matrix
Expand Down Expand Up @@ -232,7 +237,7 @@ public double visit(int row, int column, double value) {
public void test(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) {
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand All @@ -252,7 +257,7 @@ public void test(final List<File> inputFiles,
public void testSingleSample(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) { //ignored in this test
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand All @@ -271,7 +276,7 @@ public void testSingleSample(final List<File> inputFiles,
public void testZeroEigensamples(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) { //ignored in this test
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand Down