Added new suite of tools.
samuelklee committed Jul 28, 2022
1 parent 45ffd12 commit 1cbd55b
Showing 195 changed files with 4,520 additions and 8 deletions.
1 change: 1 addition & 0 deletions build.gradle
@@ -285,6 +285,7 @@ dependencies {

implementation 'org.apache.commons:commons-lang3:3.5'
implementation 'org.apache.commons:commons-math3:3.5'
implementation 'org.hipparchus:hipparchus-stat:2.0'
implementation 'org.apache.commons:commons-collections4:4.1'
implementation 'org.apache.commons:commons-vfs2:2.0'
implementation 'org.apache.commons:commons-configuration2:2.4'
1 change: 1 addition & 0 deletions scripts/gatkcondaenv.yml.template
@@ -42,6 +42,7 @@ dependencies:
- conda-forge::matplotlib=3.2.1
- conda-forge::pandas=1.0.3
- conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel

# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
- r-base=3.6.2
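For context on the dill pin: the standard library's pickle cannot serialize lambdas (it pickles functions by qualified name only), which is why TrainVariantAnnotationsModel needs dill. A minimal sketch of the difference:

import pickle

import dill

f = lambda x: x + 1

# pickle refuses to serialize a lambda...
try:
    pickle.dumps(f)
except (pickle.PicklingError, AttributeError):
    pass

# ...while dill serializes the underlying code object directly.
g = dill.loads(dill.dumps(f))
assert g(1) == 2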
4 changes: 0 additions & 4 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -192,8 +192,6 @@ task TrainVariantAnnotationModel {
command <<<
set -e

conda install -y --name gatk dill

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

mode=$(echo "~{mode}" | awk '{print toupper($0)}')
@@ -245,8 +243,6 @@ task ScoreVariantAnnotations {

ln -s ~{sep=" . && ln -s " model_files} .

conda install -y --name gatk dill

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx~{command_mem}m" \
@@ -68,7 +68,7 @@
* to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals}
* can decrease runtime by reducing time spent on IO, so this is the default output format.
* The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}. HDF5 files may be viewed using
* <a href="https://support.hdfgroup.org/products/java/hdfview/">hdfview</a> or loaded in python using
* <a href="https://support.hdfgroup.org/products/java/hdfview/">hdfview</a> or loaded in Python using
* <a href="http://www.pytables.org/">PyTables</a> or <a href="http://www.h5py.org/">h5py</a>.
* The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in
* {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows.
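As a quick illustration of the Python route this Javadoc mentions, h5py can list everything a count file contains without knowing the dataset paths in advance (file name assumed; the same applies to the panel-of-normals HDF5 described in the next file):

import h5py

# Print every group/dataset path stored in the HDF5 file.
with h5py.File("sample.counts.hdf5", "r") as hdf:
    hdf.visit(print)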
@@ -85,7 +85,7 @@
* Panel-of-normals file.
* This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}.
* HDF5 files may be viewed using <a href="https://support.hdfgroup.org/products/java/hdfview/">hdfview</a>
* or loaded in python using <a href="http://www.pytables.org/">PyTables</a> or <a href="http://www.h5py.org/">h5py</a>.
* or loaded in Python using <a href="http://www.pytables.org/">PyTables</a> or <a href="http://www.h5py.org/">h5py</a>.
* </li>
* </ul>
*
@@ -135,7 +135,7 @@ public static double[][] readChunkedDoubleMatrix(final HDF5File file,
* Given a large matrix, chunks the matrix into equally sized subsets of rows
* (plus a subset containing the remainder, if necessary) and writes these submatrices to indexed sub-paths
* to avoid a hard limit in Java HDF5 on the number of elements in a matrix given by
* {@code MAX_NUM_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize},
* {@code MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize},
* which should be set appropriately for the desired number of columns.
*
* @param maxChunkSize The maximum number of values in each chunk. Decreasing this number will reduce
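The chunking described in this Javadoc reduces to a ceiling division; a sketch with illustrative names (only MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX comes from HDF5Utils itself):

def number_of_chunks(num_rows: int, num_columns: int, max_chunk_size: int) -> int:
    # rows that fit in one chunk without exceeding max_chunk_size values
    rows_per_chunk = max_chunk_size // num_columns
    # equally sized row subsets, plus one more subset for any remainder
    return -(-num_rows // rows_per_chunk)  # ceiling division

# e.g., a 10_000 x 300 matrix with max_chunk_size = 1_000_000
# holds 3_333 rows per chunk, giving 4 chunks (3 full plus a remainder)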
@@ -10,6 +10,7 @@
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hellbender.cmdline.*;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
@@ -23,6 +24,8 @@
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.MultiVariantWalker;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import picard.cmdline.programgroups.VariantFilteringProgramGroup;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.SimpleInterval;
@@ -41,6 +44,7 @@

import java.io.*;
import java.util.*;
import java.util.stream.IntStream;

/**
* Build a recalibration model to score variant quality for filtering purposes
@@ -639,6 +643,10 @@ public Object onTraversalSuccess() {
for (int i = 1; i <= max_attempts; i++) {
try {
dataManager.setData(reduceSum);

final String rawAnnotationsOutput = output.toString().endsWith(".recal") ? output.toString().split("\\.recal")[0] : output.toString(); // strip the .recal extension, if present; String#split takes a regex, so the dot is escaped
writeAnnotationsHDF5(new File(rawAnnotationsOutput + ".annot.raw.hdf5"));

dataManager.normalizeData(inputModel == null, annotationOrder); // Each data point is now (x - mean) / standard deviation

final GaussianMixtureModel goodModel;
@@ -678,6 +686,9 @@
}
}

final String annotationsOutput = output.toString().endsWith(".recal") ? output.toString().split("\\.recal")[0] : output.toString();
writeAnnotationsHDF5(new File(annotationsOutput + ".annot.hdf5"));

dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory
engine.evaluateData(dataManager.getData(), badModel, true);

Expand All @@ -686,6 +697,10 @@ public Object onTraversalSuccess() {
saveModelReport(report, outputModel);
}

final String modelOutput = output.toString().endsWith(".recal") ? output.toString().split("\\.recal")[0] : output.toString();
writeModelHDF5(new File(modelOutput + ".positive.hdf5"), goodModel);
writeModelHDF5(new File(modelOutput + ".negative.hdf5"), badModel);

engine.calculateWorstPerformingAnnotation(dataManager.getData(), goodModel, badModel);


@@ -1176,4 +1191,43 @@ private void createArrangeFunction( final PrintStream stream ) {
stream.println("}");
stream.println("}");
}

public void writeAnnotationsHDF5(final File file) {
try (final HDF5File hdf5File = new HDF5File(file, HDF5File.OpenMode.CREATE)) { // TODO allow appending
IOUtils.canReadFile(hdf5File.getFile());

hdf5File.makeStringArray("/data/annotation_names", dataManager.getAnnotationKeys().stream().toArray(String[]::new));
hdf5File.makeDoubleMatrix("/data/annotations", dataManager.getData().stream().map(vd -> vd.annotations).toArray(double[][]::new));
hdf5File.makeDoubleArray("/data/is_training", dataManager.getData().stream().mapToDouble(vd -> vd.atTrainingSite ? 1 : 0).toArray());
hdf5File.makeDoubleArray("/data/is_truth", dataManager.getData().stream().mapToDouble(vd -> vd.atTruthSite ? 1 : 0).toArray());
hdf5File.makeDoubleArray("/data/is_anti_training", dataManager.getData().stream().mapToDouble(vd -> vd.atAntiTrainingSite ? 1 : 0).toArray());

logger.info(String.format("Annotations written to %s.", file.getAbsolutePath()));
} catch (final RuntimeException exception) {
throw new GATKException(String.format("Exception encountered during writing of annotations (%s). Output file at %s may be in a bad state.",
exception, file.getAbsolutePath()));
}
}

public void writeModelHDF5(final File file,
final GaussianMixtureModel model) {
try (final HDF5File hdf5File = new HDF5File(file, HDF5File.OpenMode.CREATE)) { // TODO allow appending
IOUtils.canReadFile(hdf5File.getFile());

final int nComponents = model.getModelGaussians().size();
final int nFeatures = model.getNumAnnotations();
hdf5File.makeDouble("/vqsr/number_of_components", nComponents);
hdf5File.makeDouble("/vqsr/number_of_features", nComponents);
hdf5File.makeDoubleArray("/vqsr/weights", model.getModelGaussians().stream().mapToDouble(g -> Math.pow(10., (g.pMixtureLog10))).toArray());
IntStream.range(0, nComponents).forEach(
k -> hdf5File.makeDoubleArray("/vqsr/means/" + k, model.getModelGaussians().get(k).mu));
IntStream.range(0, nComponents).forEach(
k -> hdf5File.makeDoubleMatrix("vqsr/covariances/" + k, model.getModelGaussians().get(k).sigma.getArray()));

logger.info(String.format("VQSR model written to %s.", file.getAbsolutePath()));
} catch (final RuntimeException exception) {
throw new GATKException(String.format("Exception encountered during writing of VQSR model (%s). Output file at %s may be in a bad state.",
exception, file.getAbsolutePath()));
}
}
}
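The fixed HDF5 paths above make the new outputs easy to inspect from Python with h5py; a minimal sketch (file names assumed, following the .annot.hdf5 and .positive.hdf5 suffixes used above; note that .annot.hdf5 holds the normalized annotations, while .annot.raw.hdf5 holds the pre-normalization values):

import h5py

with h5py.File("output.annot.hdf5", "r") as hdf:
    annotation_names = list(hdf["/data/annotation_names"][()])
    annotations = hdf["/data/annotations"][()]  # one row per variant datum
    is_training = hdf["/data/is_training"][()] == 1

with h5py.File("output.positive.hdf5", "r") as hdf:
    num_components = int(hdf["/vqsr/number_of_components"][()])
    weights = hdf["/vqsr/weights"][()]  # linear scale, i.e. 10**pMixtureLog10
    means = [hdf[f"/vqsr/means/{k}"][()] for k in range(num_components)]
    covariances = [hdf[f"/vqsr/covariances/{k}"][()] for k in range(num_components)]

print(len(annotations), "data points,", len(annotation_names), "annotations,",
      num_components, "Gaussians in the positive model")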
