Added new suite of tools.
samuelklee committed Jul 28, 2022
1 parent 45ffd12 commit 1cbd55b
Showing 195 changed files with 4,520 additions and 8 deletions.
1 change: 1 addition & 0 deletions build.gradle
@@ -285,6 +285,7 @@ dependencies {

implementation 'org.apache.commons:commons-lang3:3.5'
implementation 'org.apache.commons:commons-math3:3.5'
implementation 'org.hipparchus:hipparchus-stat:2.0'
implementation 'org.apache.commons:commons-collections4:4.1'
implementation 'org.apache.commons:commons-vfs2:2.0'
implementation 'org.apache.commons:commons-configuration2:2.4'
1 change: 1 addition & 0 deletions scripts/gatkcondaenv.yml.template
@@ -42,6 +42,7 @@ dependencies:
- conda-forge::matplotlib=3.2.1
- conda-forge::pandas=1.0.3
- conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel

# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
- r-base=3.6.2
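For context on the dill pin: the standard library's pickle cannot serialize lambdas (it pickles functions by qualified name only), which is why TrainVariantAnnotationsModel needs dill. A minimal sketch of the difference:

import pickle

import dill

f = lambda x: x + 1

# pickle refuses to serialize a lambda...
try:
    pickle.dumps(f)
except (pickle.PicklingError, AttributeError):
    pass

# ...while dill serializes the underlying code object directly.
g = dill.loads(dill.dumps(f))
assert g(1) == 2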
4 changes: 0 additions & 4 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -192,8 +192,6 @@ task TrainVariantAnnotationModel {
command <<<
set -e

conda install -y --name gatk dill

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

mode=$(echo "~{mode}" | awk '{print toupper($0)}')
@@ -245,8 +243,6 @@ task ScoreVariantAnnotations {

ln -s ~{sep=" . && ln -s " model_files} .

conda install -y --name gatk dill

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx~{command_mem}m" \
@@ -68,7 +68,7 @@
* to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals}
* can decrease runtime by reducing time spent on IO, so this is the default output format.
* The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}. HDF5 files may be viewed using
* <a href="https://support.hdfgroup.org/products/java/hdfview/">hdfview</a> or loaded in python using
* <a href="https://support.hdfgroup.org/products/java/hdfview/">hdfview</a> or loaded in Python using
* <a href="http://www.pytables.org/">PyTables</a> or <a href="http://www.h5py.org/">h5py</a>.
* The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in
* {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows.
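As a quick illustration of the Python route this Javadoc mentions, h5py can list everything a count file contains without knowing the dataset paths in advance (file name assumed; the same applies to the panel-of-normals HDF5 described in the next file):

import h5py

# Print every group/dataset path stored in the HDF5 file.
with h5py.File("sample.counts.hdf5", "r") as hdf:
    hdf.visit(print)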
@@ -85,7 +85,7 @@
* Panel-of-normals file.
* This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}.
* HDF5 files may be viewed using <a href="https://support.hdfgroup.org/products/java/hdfview/">hdfview</a>
* or loaded in python using <a href="http://www.pytables.org/">PyTables</a> or <a href="http://www.h5py.org/">h5py</a>.
* or loaded in Python using <a href="http://www.pytables.org/">PyTables</a> or <a href="http://www.h5py.org/">h5py</a>.
* </li>
* </ul>
*
@@ -135,7 +135,7 @@ public static double[][] readChunkedDoubleMatrix(final HDF5File file,
* Given a large matrix, chunks the matrix into equally sized subsets of rows
* (plus a subset containing the remainder, if necessary) and writes these submatrices to indexed sub-paths
* to avoid a hard limit in Java HDF5 on the number of elements in a matrix given by
* {@code MAX_NUM_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize},
* {@code MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize},
* which should be set appropriately for the desired number of columns.
*
* @param maxChunkSize The maximum number of values in each chunk. Decreasing this number will reduce
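The chunking described in this Javadoc reduces to a ceiling division; a sketch with illustrative names (only MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX comes from HDF5Utils itself):

def number_of_chunks(num_rows: int, num_columns: int, max_chunk_size: int) -> int:
    # rows that fit in one chunk without exceeding max_chunk_size values
    rows_per_chunk = max_chunk_size // num_columns
    # equally sized row subsets, plus one more subset for any remainder
    return -(-num_rows // rows_per_chunk)  # ceiling division

# e.g., a 10_000 x 300 matrix with max_chunk_size = 1_000_000
# holds 3_333 rows per chunk, giving 4 chunks (3 full plus a remainder)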
@@ -10,6 +10,7 @@
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hellbender.cmdline.*;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
@@ -23,6 +24,8 @@
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.MultiVariantWalker;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import picard.cmdline.programgroups.VariantFilteringProgramGroup;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.SimpleInterval;
@@ -41,6 +44,7 @@

import java.io.*;
import java.util.*;
import java.util.stream.IntStream;

/**
* Build a recalibration model to score variant quality for filtering purposes
@@ -639,6 +643,10 @@ public Object onTraversalSuccess() {
for (int i = 1; i <= max_attempts; i++) {
try {
dataManager.setData(reduceSum);

final String rawAnnotationsOutput = output.toString().endsWith(".recal") ? output.toString().split("\\.recal")[0] : output.toString(); // strip the .recal extension, if present; String#split takes a regex, so the dot is escaped
writeAnnotationsHDF5(new File(rawAnnotationsOutput + ".annot.raw.hdf5"));

dataManager.normalizeData(inputModel == null, annotationOrder); // Each data point is now (x - mean) / standard deviation

final GaussianMixtureModel goodModel;
@@ -678,6 +686,9 @@
}
}

final String annotationsOutput = output.toString().endsWith(".recal") ? output.toString().split("\\.recal")[0] : output.toString();
writeAnnotationsHDF5(new File(annotationsOutput + ".annot.hdf5"));

dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory
engine.evaluateData(dataManager.getData(), badModel, true);

Expand All @@ -686,6 +697,10 @@ public Object onTraversalSuccess() {
saveModelReport(report, outputModel);
}

final String modelOutput = output.toString().endsWith(".recal") ? output.toString().split("\\.recal")[0] : output.toString();
writeModelHDF5(new File(modelOutput + ".positive.hdf5"), goodModel);
writeModelHDF5(new File(modelOutput + ".negative.hdf5"), badModel);

engine.calculateWorstPerformingAnnotation(dataManager.getData(), goodModel, badModel);


@@ -1176,4 +1191,43 @@ private void createArrangeFunction( final PrintStream stream ) {
stream.println("}");
stream.println("}");
}

public void writeAnnotationsHDF5(final File file) {
try (final HDF5File hdf5File = new HDF5File(file, HDF5File.OpenMode.CREATE)) { // TODO allow appending
IOUtils.canReadFile(hdf5File.getFile());

hdf5File.makeStringArray("/data/annotation_names", dataManager.getAnnotationKeys().stream().toArray(String[]::new));
hdf5File.makeDoubleMatrix("/data/annotations", dataManager.getData().stream().map(vd -> vd.annotations).toArray(double[][]::new));
hdf5File.makeDoubleArray("/data/is_training", dataManager.getData().stream().mapToDouble(vd -> vd.atTrainingSite ? 1 : 0).toArray());
hdf5File.makeDoubleArray("/data/is_truth", dataManager.getData().stream().mapToDouble(vd -> vd.atTruthSite ? 1 : 0).toArray());
hdf5File.makeDoubleArray("/data/is_anti_training", dataManager.getData().stream().mapToDouble(vd -> vd.atAntiTrainingSite ? 1 : 0).toArray());

logger.info(String.format("Annotations written to %s.", file.getAbsolutePath()));
} catch (final RuntimeException exception) {
throw new GATKException(String.format("Exception encountered during writing of annotations (%s). Output file at %s may be in a bad state.",
exception, file.getAbsolutePath()));
}
}

public void writeModelHDF5(final File file,
final GaussianMixtureModel model) {
try (final HDF5File hdf5File = new HDF5File(file, HDF5File.OpenMode.CREATE)) { // TODO allow appending
IOUtils.canReadFile(hdf5File.getFile());

final int nComponents = model.getModelGaussians().size();
final int nFeatures = model.getNumAnnotations();
hdf5File.makeDouble("/vqsr/number_of_components", nComponents);
hdf5File.makeDouble("/vqsr/number_of_features", nComponents);
hdf5File.makeDoubleArray("/vqsr/weights", model.getModelGaussians().stream().mapToDouble(g -> Math.pow(10., (g.pMixtureLog10))).toArray());
IntStream.range(0, nComponents).forEach(
k -> hdf5File.makeDoubleArray("/vqsr/means/" + k, model.getModelGaussians().get(k).mu));
IntStream.range(0, nComponents).forEach(
k -> hdf5File.makeDoubleMatrix("vqsr/covariances/" + k, model.getModelGaussians().get(k).sigma.getArray()));

logger.info(String.format("VQSR model written to %s.", file.getAbsolutePath()));
} catch (final RuntimeException exception) {
throw new GATKException(String.format("Exception encountered during writing of VQSR model (%s). Output file at %s may be in a bad state.",
exception, file.getAbsolutePath()));
}
}
}
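The fixed HDF5 paths above make the new outputs easy to inspect from Python with h5py; a minimal sketch (file names assumed, following the .annot.hdf5 and .positive.hdf5 suffixes used above; note that .annot.hdf5 holds the normalized annotations, while .annot.raw.hdf5 holds the pre-normalization values):

import h5py

with h5py.File("output.annot.hdf5", "r") as hdf:
    annotation_names = list(hdf["/data/annotation_names"][()])
    annotations = hdf["/data/annotations"][()]  # one row per variant datum
    is_training = hdf["/data/is_training"][()] == 1

with h5py.File("output.positive.hdf5", "r") as hdf:
    num_components = int(hdf["/vqsr/number_of_components"][()])
    weights = hdf["/vqsr/weights"][()]  # linear scale, i.e. 10**pMixtureLog10
    means = [hdf[f"/vqsr/means/{k}"][()] for k in range(num_components)]
    covariances = [hdf[f"/vqsr/covariances/{k}"][()] for k in range(num_components)]

print(len(annotations), "data points,", len(annotation_names), "annotations,",
      num_components, "Gaussians in the positive model")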
