Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjusted logic for filtering zero-coverage samples and intervals in CreateReadCountPanelOfNormals. #6624

Merged
merged 3 commits into from
Nov 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
private double minimumIntervalMedianPercentile = DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE;

@Argument(
doc = "Samples with a fraction of zero-coverage genomic intervals above this percentage are filtered out. " +
doc = "Samples with a fraction of zero-coverage genomic intervals greater than or equal to this percentage are filtered out. " +
"(This is the second filter applied.)",
fullName = MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME,
minValue = 0.,
Expand All @@ -187,7 +187,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
private double maximumZerosInSamplePercentage = DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE;

@Argument(
doc = "Genomic intervals with a fraction of zero-coverage samples above this percentage are filtered out. " +
doc = "Genomic intervals with a fraction of zero-coverage samples greater than or equal to this percentage are filtered out. " +
"(This is the third filter applied.)",
fullName = MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME,
minValue = 0.,
Expand All @@ -198,7 +198,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram

@Argument(
doc = "Samples with a median (across genomic intervals) of fractional coverage normalized by genomic-interval medians " +
"below this percentile or above the complementary percentile are filtered out. " +
"strictly below this percentile or strictly above the complementary percentile are filtered out. " +
"(This is the fourth filter applied.)",
fullName = EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand All @@ -217,7 +217,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram

@Argument(
doc = "Fractional coverages normalized by genomic-interval medians that are " +
"below this percentile or above the complementary percentile are set to the corresponding percentile value. " +
"strictly below this percentile or strictly above the complementary percentile are set to the corresponding percentile value. " +
"(This is applied after all filters and imputation.)",
fullName = EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,10 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME));
} else {
logger.info(String.format("Filtering intervals with median (across samples) less than or equal to the %.2f percentile...", minimumIntervalMedianPercentile));
//calculate percentile
final double minimumIntervalMedianThreshold = new Percentile(minimumIntervalMedianPercentile).evaluate(originalIntervalMedians);
logger.info(String.format("Filtering intervals with median (across samples) less than or equal to the %.2f percentile (%.2f)...",
minimumIntervalMedianPercentile, minimumIntervalMedianThreshold));
//filter intervals
IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> originalIntervalMedians[intervalIndex] <= minimumIntervalMedianThreshold)
Expand All @@ -222,23 +223,23 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
.filter(intervalIndex -> !filterIntervals[intervalIndex])
.forEach(intervalIndex -> IntStream.range(0, numOriginalSamples).filter(sampleIndex -> !filterSamples[sampleIndex]).forEach(sampleIndex -> {
final double value = readCounts.getEntry(sampleIndex, intervalIndex);
readCounts.setEntry(sampleIndex, intervalIndex,value / originalIntervalMedians[intervalIndex]);
readCounts.setEntry(sampleIndex, intervalIndex, value / originalIntervalMedians[intervalIndex]); //TODO check effect of NaNs here: https://github.com/broadinstitute/gatk/issues/6878
}));

//filter samples by percentage of zero-coverage intervals not already filtered
if (maximumZerosInSamplePercentage == 100.) {
logger.info(String.format("A value of 100 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME));
} else {
logger.info(String.format("Filtering samples with a fraction of zero-coverage intervals above %.2f percent...", maximumZerosInSamplePercentage));
final int maxZerosInSample = calculateMaximumZerosCount(countNumberPassingFilter(filterIntervals), maximumZerosInSamplePercentage);
logger.info(String.format("Filtering samples with a fraction of zero-coverage intervals greater than or equal to %.2f percent...", maximumZerosInSamplePercentage));
final int numPassingIntervals = countNumberPassingFilter(filterIntervals);
IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> !filterSamples[sampleIndex])
.forEach(sampleIndex -> {
final int numZerosInSample = (int) IntStream.range(0, numOriginalIntervals)
final double numZerosInSample = (double) IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> !filterIntervals[intervalIndex] && readCounts.getEntry(sampleIndex, intervalIndex) == 0.)
.count();
if (numZerosInSample > maxZerosInSample) {
if (numZerosInSample / numPassingIntervals >= maximumZerosInSamplePercentage / 100.) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little confused, why is numZerosInSample a double rather than an int?
If you need it to be a double so that the fraction is a double, why not cast at the point of computing the fraction?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In both cases, the cast happens in the next line and the variable is not used elsewhere, so I'm OK keeping it like this. Certainly it's valid to represent an integer with a double, at least here...?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@samuelklee It's fine the way it is.

filterSamples[sampleIndex] = true;
}
});
Expand All @@ -250,15 +251,15 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 100 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME));
} else {
logger.info(String.format("Filtering intervals with a fraction of zero-coverage samples above %.2f percent...", maximumZerosInIntervalPercentage));
final int maxZerosInInterval = calculateMaximumZerosCount(countNumberPassingFilter(filterSamples), maximumZerosInIntervalPercentage);
logger.info(String.format("Filtering intervals with a fraction of zero-coverage samples greater than or equal to %.2f percent...", maximumZerosInIntervalPercentage));
final int numPassingSamples = countNumberPassingFilter(filterSamples);
IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> !filterIntervals[intervalIndex])
.forEach(intervalIndex -> {
final int numZerosInInterval = (int) IntStream.range(0, numOriginalSamples)
final double numZerosInInterval = (double) IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> !filterSamples[sampleIndex] && readCounts.getEntry(sampleIndex, intervalIndex) == 0.)
.count();
if (numZerosInInterval > maxZerosInInterval) {
if (numZerosInInterval / numPassingSamples >= maximumZerosInIntervalPercentage / 100.) {
filterIntervals[intervalIndex] = true;
}
});
Expand All @@ -270,8 +271,6 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME));
} else {
logger.info(String.format("Filtering samples with a median (across intervals) below the %.2f percentile or above the %.2f percentile...",
extremeSampleMedianPercentile, 100. - extremeSampleMedianPercentile));
//calculate the medians for all samples (which, although unnecessary, makes bookkeeping easier) across intervals not already filtered
final double[] sampleMedians = IntStream.range(0, numOriginalSamples)
.mapToDouble(sampleIndex -> new Median().evaluate(IntStream.range(0, numOriginalIntervals)
Expand All @@ -282,6 +281,8 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
//calculate percentiles
final double minimumSampleMedianThreshold = new Percentile(extremeSampleMedianPercentile).evaluate(sampleMedians);
final double maximumSampleMedianThreshold = new Percentile(100. - extremeSampleMedianPercentile).evaluate(sampleMedians);
logger.info(String.format("Filtering samples with a median (across intervals) strictly below the %.2f percentile (%.2f) or strictly above the %.2f percentile (%.2f)...",
extremeSampleMedianPercentile, minimumSampleMedianThreshold, 100. - extremeSampleMedianPercentile, maximumSampleMedianThreshold));
//filter samples
IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> sampleMedians[sampleIndex] < minimumSampleMedianThreshold || sampleMedians[sampleIndex] > maximumSampleMedianThreshold)
Expand Down Expand Up @@ -352,8 +353,8 @@ public double visit(int sampleIndex, int intervalIndex, double value) {
return value;
}
});
logger.info(String.format("%d values below the %.2f percentile or above the %.2f percentile were truncated to the corresponding value...",
numTruncated[0], extremeOutlierTruncationPercentile, 100. - extremeOutlierTruncationPercentile));
logger.info(String.format("%d values strictly below the %.2f percentile (%.2f) or strictly above the %.2f percentile (%.2f) were truncated to the corresponding value...",
numTruncated[0], extremeOutlierTruncationPercentile, minimumOutlierTruncationThreshold, 100. - extremeOutlierTruncationPercentile, maximumOutlierTruncationThreshold));
}
return new PreprocessedStandardizedResult(
preprocessedReadCounts, panelIntervalFractionalMedians, filterSamples, filterIntervals);
Expand Down Expand Up @@ -492,11 +493,6 @@ public double visit(int sampleIndex, int intervalIndex, double value) {
});
}

private static int calculateMaximumZerosCount(final int numTotalCounts,
final double percentage) {
return (int) Math.ceil(numTotalCounts * percentage / 100.0);
}

private static double safeLog2(final double x) {
return x < EPSILON ? LN2_EPSILON : Math.log(x) * MathUtils.INV_LOG_2;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.DoubleStream;
Expand Down Expand Up @@ -62,8 +63,12 @@ public final class CreateReadCountPanelOfNormalsIntegrationTest extends CommandL

//we test only for filtering of samples and intervals with too many zeros
private static final double MINIMUM_INTERVAL_MEDIAN_PERCENTILE = 0.;
private static final double MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 5.;
private static final double MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 5.;
//test filtering of 5 bad samples
private static final int NUM_ZEROS_IN_BAD_SAMPLE_FOR_SIMULATION = 20;
private static final double MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 19.5; //chosen to guard against regression of an equality check fixed in https://github.com/broadinstitute/gatk/pull/6624
//test filtering of 5 bad intervals (applied after sample filter)
private static final int NUM_ADDITIONAL_ZEROS_IN_BAD_INTERVAL_FOR_SIMULATION = 15; //these zeros are added only in remaining good, unfiltered samples
private static final double MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 14.5; //chosen to guard against regression of an equality check fixed in https://github.com/broadinstitute/gatk/pull/6624
private static final double EXTREME_SAMPLE_MEDIAN_PERCENTILE = 0.;

//test that number of eigenvalues is recovered for a few different values using fraction of variance as a heuristic
Expand Down Expand Up @@ -163,24 +168,24 @@ public double visit(int row, int column, double value) {
}
});

//corrupt first NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS samples by randomly adding zeros
//to 5 * MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE / 100. of intervals
//corrupt first NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS samples by adding zeros
//to NUM_ZEROS_IN_BAD_SAMPLE_FOR_SIMULATION randomly chosen good intervals
for (int sampleIndex = 0; sampleIndex < NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS; sampleIndex++) {
for (int intervalIndex = 0; intervalIndex < NUM_INTERVALS; intervalIndex++) {
if (rng.nextUniform(0., 1.) < 5 * MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE / 100.) {
counts.setEntry(sampleIndex, intervalIndex, 0.);
}
}
final List<Integer> intervalIndicesToZero = IntStream.range(NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS, NUM_INTERVALS).boxed().collect(Collectors.toList());
Collections.shuffle(intervalIndicesToZero, new Random(sampleIndex));
final int si = sampleIndex;
intervalIndicesToZero.subList(0, NUM_ZEROS_IN_BAD_SAMPLE_FOR_SIMULATION)
.forEach(intervalIndex -> counts.setEntry(si, intervalIndex, 0.));
}

//corrupt first NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS intervals by randomly adding zeros
//to 5 * MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE / 100. of samples
//corrupt first NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS intervals by adding zeros
//to NUM_ADDITIONAL_ZEROS_IN_BAD_INTERVAL_FOR_SIMULATION samples randomly chosen from the remaining good samples
for (int intervalIndex = 0; intervalIndex < NUM_BAD_INTERVALS_WITH_TOO_MANY_ZEROS; intervalIndex++) {
for (int sampleIndex = 0; sampleIndex < NUM_SAMPLES; sampleIndex++) {
if (rng.nextUniform(0., 1.) < 5 * MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE / 100.) {
counts.setEntry(sampleIndex, intervalIndex, 0.);
}
}
final List<Integer> sampleIndicesToZero = IntStream.range(NUM_BAD_SAMPLES_WITH_TOO_MANY_ZEROS, NUM_SAMPLES).boxed().collect(Collectors.toList()); //choose only from good samples
Collections.shuffle(sampleIndicesToZero, new Random(intervalIndex));
final int ii = intervalIndex;
sampleIndicesToZero.subList(0, NUM_ADDITIONAL_ZEROS_IN_BAD_INTERVAL_FOR_SIMULATION)
.forEach(sampleIndex -> counts.setEntry(sampleIndex, ii, 0.));
}

//make input files from counts matrix
Expand Down Expand Up @@ -232,7 +237,7 @@ public double visit(int row, int column, double value) {
public void test(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) {
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand All @@ -252,7 +257,7 @@ public void test(final List<File> inputFiles,
public void testSingleSample(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) { //ignored in this test
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand All @@ -271,7 +276,7 @@ public void testSingleSample(final List<File> inputFiles,
public void testZeroEigensamples(final List<File> inputFiles,
final File annotatedIntervalsFile,
final int expectedNumberOfEigenvalues) { //ignored in this test
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".tsv");
final File resultOutputFile = createTempFile("create-read-count-panel-of-normals-test", ".hdf5");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add(CreateReadCountPanelOfNormals.MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME, Double.toString(MINIMUM_INTERVAL_MEDIAN_PERCENTILE))
.add(CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME, Double.toString(MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE))
Expand Down