Skip to content

Commit

Permalink
Adjusted logic for filtering zero-coverage samples and intervals in CreateReadCountPanelOfNormals.
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelklee committed Oct 8, 2020
1 parent 851c840 commit 2a8001d
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
private double minimumIntervalMedianPercentile = DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE;

@Argument(
doc = "Samples with a fraction of zero-coverage genomic intervals above this percentage are filtered out. " +
doc = "Samples with a fraction of zero-coverage genomic intervals greater than or equal to this percentage are filtered out. " +
"(This is the second filter applied.)",
fullName = MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME,
minValue = 0.,
Expand All @@ -187,7 +187,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram
private double maximumZerosInSamplePercentage = DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE;

@Argument(
doc = "Genomic intervals with a fraction of zero-coverage samples above this percentage are filtered out. " +
doc = "Genomic intervals with a fraction of zero-coverage samples greater than or equal to this percentage are filtered out. " +
"(This is the third filter applied.)",
fullName = MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME,
minValue = 0.,
Expand All @@ -198,7 +198,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram

@Argument(
doc = "Samples with a median (across genomic intervals) of fractional coverage normalized by genomic-interval medians " +
"below this percentile or above the complementary percentile are filtered out. " +
"strictly below this percentile or strictly above the complementary percentile are filtered out. " +
"(This is the fourth filter applied.)",
fullName = EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand All @@ -217,7 +217,7 @@ public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram

@Argument(
doc = "Fractional coverages normalized by genomic-interval medians that are " +
"below this percentile or above the complementary percentile are set to the corresponding percentile value. " +
"strictly below this percentile or strictly above the complementary percentile are set to the corresponding percentile value. " +
"(This is applied after all filters and imputation.)",
fullName = EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME,
minValue = 0.,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,15 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 100 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME));
} else {
logger.info(String.format("Filtering samples with a fraction of zero-coverage intervals above %.2f percent...", maximumZerosInSamplePercentage));
final int maxZerosInSample = calculateMaximumZerosCount(countNumberPassingFilter(filterIntervals), maximumZerosInSamplePercentage);
logger.info(String.format("Filtering samples with a fraction of zero-coverage intervals greater than or equal to %.2f percent...", maximumZerosInSamplePercentage));
final int numPassingIntervals = countNumberPassingFilter(filterIntervals);
IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> !filterSamples[sampleIndex])
.forEach(sampleIndex -> {
final int numZerosInSample = (int) IntStream.range(0, numOriginalIntervals)
final double numZerosInSample = (double) IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> !filterIntervals[intervalIndex] && readCounts.getEntry(sampleIndex, intervalIndex) == 0.)
.count();
if (numZerosInSample > maxZerosInSample) {
if (numZerosInSample / numPassingIntervals >= maximumZerosInSamplePercentage / 100.) {
filterSamples[sampleIndex] = true;
}
});
Expand All @@ -250,15 +250,15 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 100 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME));
} else {
logger.info(String.format("Filtering intervals with a fraction of zero-coverage samples above %.2f percent...", maximumZerosInIntervalPercentage));
final int maxZerosInInterval = calculateMaximumZerosCount(countNumberPassingFilter(filterSamples), maximumZerosInIntervalPercentage);
logger.info(String.format("Filtering intervals with a fraction of zero-coverage samples greater than or equal to %.2f percent...", maximumZerosInIntervalPercentage));
final int numPassingSamples = countNumberPassingFilter(filterSamples);
IntStream.range(0, numOriginalIntervals)
.filter(intervalIndex -> !filterIntervals[intervalIndex])
.forEach(intervalIndex -> {
final int numZerosInInterval = (int) IntStream.range(0, numOriginalSamples)
final double numZerosInInterval = (double) IntStream.range(0, numOriginalSamples)
.filter(sampleIndex -> !filterSamples[sampleIndex] && readCounts.getEntry(sampleIndex, intervalIndex) == 0.)
.count();
if (numZerosInInterval > maxZerosInInterval) {
if (numZerosInInterval / numPassingSamples >= maximumZerosInIntervalPercentage / 100.) {
filterIntervals[intervalIndex] = true;
}
});
Expand All @@ -270,7 +270,7 @@ private static PreprocessedStandardizedResult preprocessPanel(final RealMatrix r
logger.info(String.format("A value of 0 was provided for argument %s, so the corresponding filtering step will be skipped...",
CreateReadCountPanelOfNormals.EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME));
} else {
logger.info(String.format("Filtering samples with a median (across intervals) below the %.2f percentile or above the %.2f percentile...",
logger.info(String.format("Filtering samples with a median (across intervals) strictly below the %.2f percentile or strictly above the %.2f percentile...",
extremeSampleMedianPercentile, 100. - extremeSampleMedianPercentile));
//calculate the medians for all samples (which, although unnecessary, makes bookkeeping easier) across intervals not already filtered
final double[] sampleMedians = IntStream.range(0, numOriginalSamples)
Expand Down Expand Up @@ -352,7 +352,7 @@ public double visit(int sampleIndex, int intervalIndex, double value) {
return value;
}
});
logger.info(String.format("%d values below the %.2f percentile or above the %.2f percentile were truncated to the corresponding value...",
logger.info(String.format("%d values strictly below the %.2f percentile or strictly above the %.2f percentile were truncated to the corresponding value...",
numTruncated[0], extremeOutlierTruncationPercentile, 100. - extremeOutlierTruncationPercentile));
}
return new PreprocessedStandardizedResult(
Expand Down Expand Up @@ -492,11 +492,6 @@ public double visit(int sampleIndex, int intervalIndex, double value) {
});
}

/**
 * Converts a percentage of a total count into an integer count, rounding up.
 *
 * @param numTotalCounts total number of items the percentage is taken of
 * @param percentage     percentage of {@code numTotalCounts} (divided by 100 internally)
 * @return the ceiling of {@code numTotalCounts * percentage / 100}, narrowed to {@code int}
 */
private static int calculateMaximumZerosCount(final int numTotalCounts,
                                              final double percentage) {
    // Keep the multiply-then-divide order so floating-point rounding is unchanged.
    final double fractionalCount = numTotalCounts * percentage / 100.0;
    final double roundedUp = Math.ceil(fractionalCount);
    return (int) roundedUp;
}

/**
 * Logarithm with a lower floor on the input.
 *
 * Inputs below {@code EPSILON} are not passed to {@link Math#log(double)}; the constant
 * {@code LN2_EPSILON} is returned for them instead. Otherwise the natural log is scaled by
 * {@code MathUtils.INV_LOG_2} (presumably 1/ln(2), giving a base-2 logarithm; confirm against MathUtils).
 *
 * @param x value to take the logarithm of
 * @return {@code LN2_EPSILON} when {@code x < EPSILON}; otherwise {@code Math.log(x) * MathUtils.INV_LOG_2}
 */
private static double safeLog2(final double x) {
    if (x < EPSILON) {
        return LN2_EPSILON;
    }
    final double naturalLog = Math.log(x);
    return naturalLog * MathUtils.INV_LOG_2;
}
Expand Down

0 comments on commit 2a8001d

Please sign in to comment.