Skip to content

Commit

Permalink
Addressed comments, added support for empirical distribution in read-…
Browse files Browse the repository at this point in the history
…metadata file
  • Loading branch information
vruano committed Aug 23, 2018
1 parent ec7cf82 commit 4e3682c
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 13 deletions.
1 change: 1 addition & 0 deletions scripts/sv/run_whole_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ case ${GATK_SV_TOOL} in
--breakpoint-intervals ${PROJECT_OUTPUT_DIR}/intervals \
--high-coverage-intervals "${PROJECT_OUTPUT_DIR}/highCoverageIntervals.bed" \
--fastq-dir ${PROJECT_OUTPUT_DIR}/fastq \
--read-metadata ${PROJECT_OUTPUT_DIR}/read-metadata.txt \
--contig-sam-file ${PROJECT_OUTPUT_DIR}/assemblies.bam \
--target-link-file ${PROJECT_OUTPUT_DIR}/target_links.bedpe \
--exp-interpret"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@
import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.distribution.RealDistribution;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
Expand All @@ -28,6 +34,39 @@ public interface Type {
List<String> getNames();

AbstractRealDistribution fromMeanAndStdDeviation(final double mean, final double stddev);

default AbstractRealDistribution fromReadMetadataFile(final String meanString) {
try (final BufferedReader reader = new BufferedReader(new InputStreamReader(BucketUtils.openFile(meanString)))) {
String line;
int value;
double totalSum = 0;
double totalSqSum = 0;
long totalCount = 0;
while ((line = reader.readLine()) != null) {
if (line.startsWith(ReadMetadata.CDF_PREFIX)) {
final String[] cdf = line.substring(ReadMetadata.CDF_PREFIX.length() + 1).split("\t");
long leftCdf = 0;
for (value = 0; value < cdf.length; value++) {
final long frequency = Long.parseLong(cdf[value]) - leftCdf;
leftCdf += frequency;
totalSum += frequency * value;
totalSqSum += value * value * frequency;
}
totalCount += leftCdf;
}
}
if (totalCount == 0) {
throw new UserException.MalformedFile("Could not find any insert-sizes in " + meanString);
}
final double mean = totalSum / totalCount;
final double stdDev = Math.sqrt(Math.abs(totalSqSum/totalCount - mean * mean));
return fromMeanAndStdDeviation(mean, stdDev);
} catch (final IOException ex) {
throw new UserException.CouldNotReadInputFile(meanString);
} catch (final NumberFormatException ex) {
throw new UserException.MalformedFile("the CDF contains non-numbers in " + meanString);
}
}
}

public static class NormalType implements Type {
Expand All @@ -41,6 +80,7 @@ public List<String> getNames() {
public AbstractRealDistribution fromMeanAndStdDeviation(final double mean, final double stddev) {
return new NormalDistribution(mean, stddev);
}

}

public static class LogNormalType implements Type {
Expand All @@ -60,7 +100,7 @@ public AbstractRealDistribution fromMeanAndStdDeviation(final double mean, final
}

private static Pattern DESCRIPTION_PATTERN =
Pattern.compile("^\\s*(?<name>[^\\s\\(\\)]+)\\s*\\((?<mean>[^,\\(\\)]+?)\\s*,\\s*(?<stddev>[^,\\(\\)]+?)\\s*\\)\\s*");
Pattern.compile("^\\s*(?<name>[^\\s\\(\\)]+)\\s*\\((?<mean>[^,\\(\\)]+?)\\s*(?:,\\s*(?<stddev>[^,\\(\\)]+?)\\s*)?\\)\\s*");

private final String description;

Expand All @@ -71,10 +111,14 @@ private AbstractRealDistribution dist() {
return dist;
}

public double average() {
public double mean() {
return dist().getNumericalMean();
}

public double variance() { return dist().getNumericalVariance(); }

public double stddev() { return Math.sqrt(variance()); }

public InsertSizeDistribution(final String distrString) {
this.description = distrString;
initializeDistribution();
Expand All @@ -90,12 +134,16 @@ private void initializeDistribution() {
final String meanString = matcher.group("mean");
final String stddevString = matcher.group("stddev");
final Type type = extractDistributionType(nameString, description);
final double mean = extractDoubleParameter("mean", description, meanString, 0, Double.MAX_VALUE);
final double stddev = extractDoubleParameter("stddev", description, stddevString, 0, Double.MAX_VALUE);
dist = type.fromMeanAndStdDeviation(mean, stddev);
if (stddevString != null) {
final double mean = extractDoubleParameter("mean", description, meanString, 0, Double.MAX_VALUE);
final double stddev = extractDoubleParameter("stddev", description, stddevString, 0, Double.MAX_VALUE);
dist = type.fromMeanAndStdDeviation(mean, stddev);
} else {
dist = type.fromReadMetadataFile(meanString);
}
}

private static final Type extractDistributionType(final String nameString, final String description) {
private static Type extractDistributionType(final String nameString, final String description) {
for (final Type candidate : SUPPORTED_TYPES) {
if (candidate.getNames().stream().anyMatch(name -> name.toLowerCase().equals(nameString.trim().toLowerCase()))) {
return candidate;
Expand All @@ -105,7 +153,7 @@ private static final Type extractDistributionType(final String nameString, final
+ "' in description: " + description);
}

private static final double extractDoubleParameter(final String name, final String description,
private static double extractDoubleParameter(final String name, final String description,
final String valueString, final double min,
final double max) {
final double value;
Expand All @@ -132,7 +180,7 @@ public int hashCode() {

@Override
public boolean equals(final Object obj) {
if (obj == null || !(obj instanceof InsertSizeDistribution)) {
if (!(obj instanceof InsertSizeDistribution)) {
return false;
} else {
return ((InsertSizeDistribution)obj).dist().equals(dist());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
*/
@DefaultSerializer(ReadMetadata.Serializer.class)
public class ReadMetadata {

public static final String CDF_PREFIX = "template size cumulative counts:";
private final Set<Integer> crossContigIgnoreSet;
private final Map<String, Integer> contigNameToID;
private final String[] contigIDToName;
Expand Down Expand Up @@ -363,7 +365,7 @@ public static void writeMetadata(final ReadMetadata readMetadata,
final IntHistogram.CDF templateSizeCDF = stats.getCDF();
final int cdfSize = templateSizeCDF.size();
final long totalObservations = templateSizeCDF.getTotalObservations();
writer.write("template size cumulative counts:");
writer.write(CDF_PREFIX);
for(int idx = 0; idx < cdfSize; ++idx) {
final long cumulativeCounts = Math.round(templateSizeCDF.getFraction(idx) * totalObservations);
writer.write("\t" + cumulativeCounts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,38 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.apache.commons.math3.random.JDKRandomGenerator;
import org.broadinstitute.hellbender.utils.test.BaseTest;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.function.IntToDoubleFunction;

/**
* Unit tests for {@link InsertSizeDistribution}.
*/
public class InsertSizeDistributionUnitTest extends BaseTest {
public class InsertSizeDistributionUnitTest extends GATKBaseTest {

private static final String READ_METADATA_FILE_NAME = "read-metadata-example.txt.gz";

private static final File READ_METADATA_FILE = new File( publicTestDir + InsertSizeDistribution.class.getPackage().getName()
.replace('.', File.separatorChar) + File.separatorChar + READ_METADATA_FILE_NAME);

@Test
public void testLogNormalFromMetaData() {
final InsertSizeDistribution lnDist = new InsertSizeDistribution("LogNormal(" + READ_METADATA_FILE.toString() + ")");
final InsertSizeDistribution nDist = new InsertSizeDistribution("Normal(" + READ_METADATA_FILE.toString() + ")");
for (final InsertSizeDistribution dist : Arrays.asList(lnDist, nDist)) {
Assert.assertEquals(dist.mean(), 379.1432, 0.00005); // calculated independently using R.
Assert.assertEquals(dist.variance(), 18163.74, 0.005);
Assert.assertEquals(dist.stddev(), 134.7729, 0.00005);
}
}

@Test(dataProvider = "testData")
public void testProbability(final String description, final int x, final double expected, final double logExpected) {
Expand Down Expand Up @@ -114,5 +131,4 @@ private String composeDescriptionString(final String baseName, final double mean
')' +
StringUtils.repeat(' ', spacesDistr.sample());
}

}
Binary file not shown.

0 comments on commit 4e3682c

Please sign in to comment.