diff --git a/build.gradle b/build.gradle index 3af1061cad7..e2754b5a489 100644 --- a/build.gradle +++ b/build.gradle @@ -285,6 +285,7 @@ dependencies { implementation 'org.apache.commons:commons-lang3:3.5' implementation 'org.apache.commons:commons-math3:3.5' + implementation 'org.hipparchus:hipparchus-stat:2.0' implementation 'org.apache.commons:commons-collections4:4.1' implementation 'org.apache.commons:commons-vfs2:2.0' implementation 'org.apache.commons:commons-configuration2:2.4' diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template index 10467af3cdf..dbe29ed5a28 100644 --- a/scripts/gatkcondaenv.yml.template +++ b/scripts/gatkcondaenv.yml.template @@ -42,6 +42,7 @@ dependencies: - conda-forge::matplotlib=3.2.1 - conda-forge::pandas=1.0.3 - conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs +- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel # core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies! - r-base=3.6.2 diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl index 89e05abcb89..87b520aca0d 100644 --- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -192,8 +192,6 @@ task TrainVariantAnnotationModel { command <<< set -e - conda install -y --name gatk dill - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} mode=$(echo "~{mode}" | awk '{print toupper($0)}') @@ -245,8 +243,6 @@ task ScoreVariantAnnotations { ln -s ~{sep=" . && ln -s " model_files} . - conda install -y --name gatk dill - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} gatk --java-options "-Xmx~{command_mem}m" \ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java index 4706a1e1d7d..dbe972fa541 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java @@ -68,7 +68,7 @@ * to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals} * can decrease runtime, by reducing time spent on IO, so this is the default output format. * The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}. HDF5 files may be viewed using - * hdfview or loaded in python using + * hdfview or loaded in Python using * PyTables or h5py. * The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in * {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java index dbcc0cc1c4d..d4d6b8db9c0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java @@ -85,7 +85,7 @@ * Panel-of-normals file. * This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}. 
* HDF5 files may be viewed using hdfview - * or loaded in python using PyTables or h5py. + * or loaded in Python using PyTables or h5py. * * * diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java index 8590e3476f2..870ce37b7dc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java @@ -135,7 +135,7 @@ public static double[][] readChunkedDoubleMatrix(final HDF5File file, * Given a large matrix, chunks the matrix into equally sized subsets of rows * (plus a subset containing the remainder, if necessary) and writes these submatrices to indexed sub-paths * to avoid a hard limit in Java HDF5 on the number of elements in a matrix given by - * {@code MAX_NUM_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize}, + * {@code MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize}, * which should be set appropriately for the desired number of columns. * * @param maxChunkSize The maximum number of values in each chunk. Decreasing this number will reduce diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java index f7148d043f1..2da7997a51c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java @@ -10,6 +10,7 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hdf5.HDF5File; import org.broadinstitute.hellbender.cmdline.*; import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -23,6 +24,8 @@ import org.broadinstitute.hellbender.engine.ReadsContext; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.MultiVariantWalker; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.utils.io.IOUtils; import picard.cmdline.programgroups.VariantFilteringProgramGroup; import org.broadinstitute.hellbender.utils.R.RScriptExecutor; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -41,6 +44,7 @@ import java.io.*; import java.util.*; +import java.util.stream.IntStream; /** * Build a recalibration model to score variant quality for filtering purposes @@ -639,6 +643,10 @@ public Object onTraversalSuccess() { for (int i = 1; i <= max_attempts; i++) { try { dataManager.setData(reduceSum); + + final String rawAnnotationsOutput = output.toString().endsWith(".recal") ? output.toString().split(".recal")[0] : output.toString(); + writeAnnotationsHDF5(new File(rawAnnotationsOutput + ".annot.raw.hdf5")); + dataManager.normalizeData(inputModel == null, annotationOrder); // Each data point is now (x - mean) / standard deviation final GaussianMixtureModel goodModel; @@ -678,6 +686,9 @@ public Object onTraversalSuccess() { } } + final String annotationsOutput = output.toString().endsWith(".recal") ? 
output.toString().split(".recal")[0] : output.toString(); + writeAnnotationsHDF5(new File(annotationsOutput + ".annot.hdf5")); + + dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory engine.evaluateData(dataManager.getData(), badModel, true); @@ -686,6 +697,10 @@ public Object onTraversalSuccess() { saveModelReport(report, outputModel); } + final String modelOutput = output.toString().endsWith(".recal") ? output.toString().split(".recal")[0] : output.toString(); + writeModelHDF5(new File(modelOutput + ".positive.hdf5"), goodModel); + writeModelHDF5(new File(modelOutput + ".negative.hdf5"), badModel); + engine.calculateWorstPerformingAnnotation(dataManager.getData(), goodModel, badModel); @@ -1176,4 +1191,43 @@ private void createArrangeFunction( final PrintStream stream ) { stream.println("}"); stream.println("}"); } + + public void writeAnnotationsHDF5(final File file) { + try (final HDF5File hdf5File = new HDF5File(file, HDF5File.OpenMode.CREATE)) { // TODO allow appending + IOUtils.canReadFile(hdf5File.getFile()); + + hdf5File.makeStringArray("/data/annotation_names", dataManager.getAnnotationKeys().stream().toArray(String[]::new)); + hdf5File.makeDoubleMatrix("/data/annotations", dataManager.getData().stream().map(vd -> vd.annotations).toArray(double[][]::new)); + hdf5File.makeDoubleArray("/data/is_training", dataManager.getData().stream().mapToDouble(vd -> vd.atTrainingSite ? 1 : 0).toArray()); + hdf5File.makeDoubleArray("/data/is_truth", dataManager.getData().stream().mapToDouble(vd -> vd.atTruthSite ? 1 : 0).toArray()); + hdf5File.makeDoubleArray("/data/is_anti_training", dataManager.getData().stream().mapToDouble(vd -> vd.atAntiTrainingSite ? 1 : 0).toArray()); + + logger.info(String.format("Annotations written to %s.", file.getAbsolutePath())); + } catch (final RuntimeException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations (%s). Output file at %s may be in a bad state.", + exception, file.getAbsolutePath())); + } + } + + public void writeModelHDF5(final File file, + final GaussianMixtureModel model) { + try (final HDF5File hdf5File = new HDF5File(file, HDF5File.OpenMode.CREATE)) { // TODO allow appending + IOUtils.canReadFile(hdf5File.getFile()); + + final int nComponents = model.getModelGaussians().size(); + final int nFeatures = model.getNumAnnotations(); + hdf5File.makeDouble("/vqsr/number_of_components", nComponents); + hdf5File.makeDouble("/vqsr/number_of_features", nFeatures); + hdf5File.makeDoubleArray("/vqsr/weights", model.getModelGaussians().stream().mapToDouble(g -> Math.pow(10., (g.pMixtureLog10))).toArray()); + IntStream.range(0, nComponents).forEach( + k -> hdf5File.makeDoubleArray("/vqsr/means/" + k, model.getModelGaussians().get(k).mu)); + IntStream.range(0, nComponents).forEach( + k -> hdf5File.makeDoubleMatrix("/vqsr/covariances/" + k, model.getModelGaussians().get(k).sigma.getArray())); + + logger.info(String.format("VQSR model written to %s.", file.getAbsolutePath())); + } catch (final RuntimeException exception) { + throw new GATKException(String.format("Exception encountered during writing of VQSR model (%s). 
Output file at %s may be in a bad state.", + exception, file.getAbsolutePath())); + } + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java new file mode 100644 index 00000000000..48f73007767 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java @@ -0,0 +1,361 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.commons.lang3.tuple.Triple; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files. + * + *

+ * This tool is intended to be used as the first step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. This tool extracts site-level annotations, labels, and other relevant metadata + * from variant sites (or alleles, in allele-specific mode) that are or are not present in specified labeled + * resource VCFs (e.g., training or calibration VCFs). The former, present sites are considered labeled; each site + * can have multiple labels. The latter sites are considered unlabeled and can be randomly downsampled using + * reservoir sampling; extraction of these is optional. The outputs of the tool are HDF5 files containing the + * extracted data for labeled and (optional) unlabeled variant sets, as well as a sites-only indexed VCF containing + * the labeled variants. + *
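The random downsampling mentioned above is standard reservoir sampling (Algorithm R). A minimal Python sketch of the idea, given purely for illustration and independent of the GATK implementation:

    import random

    def reservoir_sample(stream, k, seed=0):
        """Keep a uniform random sample of size k from a stream of unknown length (Algorithm R)."""
        rng = random.Random(seed)
        reservoir = []
        for i, item in enumerate(stream):
            if i < k:
                reservoir.append(item)      # fill the reservoir with the first k items
            else:
                j = rng.randint(0, i)       # uniform index in [0, i]
                if j < k:
                    reservoir[j] = item     # replace an existing entry with decreasing probability
        return reservoir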

+ * + *

+ * The extracted sets can be provided as input to the {@link TrainVariantAnnotationsModel} tool + * to produce an annotation-based model for scoring variant calls. This model can in turn be provided + * along with a VCF file to the {@link ScoreVariantAnnotations} tool, which assigns a score to each call + * (with a lower score indicating that a call is more likely to be an artifact and should perhaps be filtered). + * Each score can also be converted to a corresponding sensitivity to a calibration set, if the latter is available. + *

+ * + *

+ * Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files + * upon completion of the traversal. Memory requirements thus roughly scale linearly with both the number of sites + * extracted and the number of annotations. + *

+ * + *

+ * Note that HDF5 files may be viewed using hdfview + * or loaded in Python using PyTables or h5py. + *
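For a quick interactive look at the extracted file, a minimal h5py sketch (the file name is illustrative; the actual dataset paths are whatever {@link LabeledVariantAnnotationsData#writeHDF5} defines, so we simply list them):

    import h5py

    # List every dataset in an extracted annotations file, e.g. extract.annot.hdf5,
    # to discover the actual layout before reading specific paths.
    with h5py.File("extract.annot.hdf5", "r") as f:
        f.visititems(lambda name, obj: print(name, getattr(obj, "shape", "")))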

+ * + *

+ * <h3>Inputs</h3>

+ * + * + * + *

+ * <h3>Outputs</h3>

+ * + * + * + *

+ * <h3>Usage examples</h3>

+ * + *

+ * Extract annotations from training/calibration SNP/INDEL sites, producing the outputs + * 1) {@code extract.annot.hdf5}, 2) {@code extract.vcf.gz}, and 3) {@code extract.vcf.gz.tbi}. + * The HDF5 file can then be provided to {@link TrainVariantAnnotationsModel} + * to train a model using a positive-only approach. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          -O extract
+ * 
+ *

+ * + *

+ * Extract annotations from both training/calibration SNP/INDEL sites and a random sample of + * 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs + * 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz}, + * and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel} + * to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}). + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --maximum-number-of-unlabeled-variants 1000000 \
+ *          -O extract
+ * 
+ *

+ * + *

+ * In the (atypical) event that resource VCFs are unavailable, one can still extract annotations from a random sample of + * unlabeled sites, producing the outputs 1) {@code extract.unlabeled.annot.hdf5}, + * 2) {@code extract.vcf.gz} (which will contain no records), and 3) {@code extract.vcf.gz.tbi}. + * This random sample cannot be used by {@link TrainVariantAnnotationsModel}, but may still be useful for + * exploratory analyses. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          --maximum-number-of-unlabeled-variants 1000000 \
+ *          -O extract
+ * 
+ *
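For such exploratory analyses, a simple per-annotation summary of the unlabeled sample might look like the following sketch (the dataset paths are illustrative assumptions; inspect the file for the actual layout):

    import h5py
    import numpy as np

    with h5py.File("extract.unlabeled.annot.hdf5", "r") as f:
        names = [n.decode() if isinstance(n, bytes) else n for n in f["/data/annotation_names"][()]]  # assumed path
        X = f["/data/annotations"][()]                                                                # assumed path

    # Robust per-annotation summaries; NaNs correspond to annotations missing at a site.
    for name, column in zip(names, X.T):
        print(name, np.nanmedian(column), np.nanpercentile(column, [25, 75]))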

+ * + * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files.", + oneLineSummary = "Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWalker { + + public static final String MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME = "maximum-number-of-unlabeled-variants"; + public static final String RESERVOIR_SAMPLING_RANDOM_SEED_LONG_NAME = "reservoir-sampling-random-seed"; + + public static final String UNLABELED_TAG = ".unlabeled"; + + @Argument( + fullName = MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, + doc = "Maximum number of unlabeled variants to extract. " + + "If greater than zero, reservoir sampling will be used to randomly sample this number " + + "of sites from input sites that are not present in the specified resources.", + minValue = 0) + private int maximumNumberOfUnlabeledVariants = 0; + + @Argument( + fullName = RESERVOIR_SAMPLING_RANDOM_SEED_LONG_NAME, + doc = "Random seed to use for reservoir sampling of unlabeled variants.") + private int reservoirSamplingRandomSeed = 0; + + private RandomGenerator rng; + private LabeledVariantAnnotationsData unlabeledDataReservoir; // will not be sorted in genomic order + private int unlabeledIndex = 0; + + @Override + public void afterOnTraversalStart() { + if (!resourceLabels.contains(LabeledVariantAnnotationsData.TRAINING_LABEL)) { + logger.warn("No training set found! If you are using the downstream TrainVariantAnnotationsModel and ScoreVariantAnnotations tools, " + + "provide sets of known polymorphic loci marked with the training=true feature input tag. " + + "For example, --resource:hapmap,training=true hapmap.vcf"); + } + if (!resourceLabels.contains(LabeledVariantAnnotationsData.CALIBRATION_LABEL)) { + logger.warn("No calibration set found! If you are using the downstream TrainVariantAnnotationsModel and ScoreVariantAnnotations tools " + + "and wish to convert scores to sensitivity to a calibration set of variants, " + + "provide sets of known polymorphic loci marked with the calibration=true feature input tag. " + + "For example, --resource:hapmap,calibration=true hapmap.vcf"); + } + + rng = RandomGeneratorFactory.createRandomGenerator(new Random(reservoirSamplingRandomSeed)); + unlabeledDataReservoir = maximumNumberOfUnlabeledVariants == 0 + ? 
null + : new LabeledVariantAnnotationsData(annotationNames, resourceLabels, useASAnnotations, maximumNumberOfUnlabeledVariants); + } + + @Override + protected void nthPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext, + final int n) { + if (n == 0) { + final List, VariantType, TreeSet>> metadata = extractVariantMetadata( + variant, featureContext, unlabeledDataReservoir != null); + final boolean isVariantExtracted = !metadata.isEmpty(); + if (isVariantExtracted) { + final boolean isUnlabeled = metadata.stream().map(Triple::getRight).allMatch(Set::isEmpty); + if (!isUnlabeled) { + addExtractedVariantToData(data, variant, metadata); + writeExtractedVariantToVCF(variant, metadata); + } else { + // Algorithm R for reservoir sampling: https://en.wikipedia.org/wiki/Reservoir_sampling#Simple_algorithm + if (unlabeledIndex < maximumNumberOfUnlabeledVariants) { + addExtractedVariantToData(unlabeledDataReservoir, variant, metadata); + } else { + final int j = rng.nextInt(unlabeledIndex); + if (j < maximumNumberOfUnlabeledVariants) { + setExtractedVariantInData(unlabeledDataReservoir, variant, metadata, j); + } + } + unlabeledIndex++; + } + } + } + } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + writeAnnotationsToHDF5(); + data.clear(); + if (unlabeledDataReservoir != null) { + writeUnlabeledAnnotationsToHDF5(); + // TODO write extracted unlabeled variants to VCF, which can be used to mark extraction in scoring step + unlabeledDataReservoir.clear(); + } + if (vcfWriter != null) { + vcfWriter.close(); + } + } + } + + @Override + public Object onTraversalSuccess() { + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } + + private static void setExtractedVariantInData(final LabeledVariantAnnotationsData data, + final VariantContext variant, + final List, VariantType, TreeSet>> metadata, + final int index) { + data.set(index, variant, + metadata.stream().map(Triple::getLeft).collect(Collectors.toList()), + metadata.stream().map(Triple::getMiddle).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).collect(Collectors.toList())); + } + + private void writeUnlabeledAnnotationsToHDF5() { + final File outputUnlabeledAnnotationsFile = new File(outputPrefix + UNLABELED_TAG + ANNOTATIONS_HDF5_SUFFIX); + if (unlabeledDataReservoir.size() == 0) { + throw new GATKException("No unlabeled variants were present in the input VCF."); + } + for (final VariantType variantType : variantTypesToExtract) { + logger.info(String.format("Extracted unlabeled annotations for %d variants of type %s.", + unlabeledDataReservoir.getVariantTypeFlat().stream().mapToInt(t -> t == variantType ? 
1 : 0).sum(), variantType)); + } + logger.info(String.format("Extracted unlabeled annotations for %s total variants.", unlabeledDataReservoir.size())); + + logger.info("Writing unlabeled annotations..."); + // TODO coordinate sort + unlabeledDataReservoir.writeHDF5(outputUnlabeledAnnotationsFile, omitAllelesInHDF5); + logger.info(String.format("Unlabeled annotations and metadata written to %s.", outputUnlabeledAnnotationsFile.getAbsolutePath())); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java new file mode 100644 index 00000000000..128b3bcf1df --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java @@ -0,0 +1,382 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.collections4.ListUtils; +import org.apache.commons.lang3.tuple.Triple; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.FeatureInput; +import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.hellbender.utils.variant.VcfUtils; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * Base walker for both {@link ExtractVariantAnnotations} and {@link ScoreVariantAnnotations}, + * which enforces identical variant-extraction behavior in both tools via {@link #extractVariantMetadata}. + * + * This base implementation covers functionality for {@link ExtractVariantAnnotations}. 
Namely, it is a single-pass + * walker, performing the operations: + * + * - nthPassApply(n = 0) + * - if variant/alleles pass filters and variant-type/overlapping-resource checks, then: + * - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection + * - write variant/alleles with labels appended to a sites-only VCF file + * - afterNthPass(n = 0) + * - write the resulting {@link LabeledVariantAnnotationsData} collection to an HDF5 file + * + * This results in the following output: + * + * - an HDF5 file, with the directory structure documented in {@link LabeledVariantAnnotationsData#writeHDF5}; + * note that the matrix of annotations contains a single row per datum (i.e., per allele, in allele-specific mode, + * and per variant otherwise) + * - a sites-only VCF file, containing a single line per extracted variant, with labels appended + * + * In contrast, the {@link ScoreVariantAnnotations} implementation overrides methods to yield a two-pass walker, + * performing the operations: + * + * - nthPassApply(n = 0) + * - if variant/alleles pass filters and variant-type checks, then: + * - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection + * - afterNthPass(n = 0) + * - write the resulting {@link LabeledVariantAnnotationsData} collection to an HDF5 file + * - pass this annotations HDF5 file to a {@link VariantAnnotationsScorer}, which generates and writes scores to an HDF5 file + * - read the scores back in and load them into an iterator + * - nthPassApply(n = 1) + * - if variant/alleles pass filters and variant-type checks (which are identical to the first pass), then: + * - draw the corresponding score (or scores, in allele-specific mode) from the iterator + * - write the variant (with all alleles, not just those extracted) with the score + * (or best score, in allele-specific mode) appended to a VCF file + * - else: + * - write an unprocessed copy of the variant to a VCF file + * + * This results in the following output: + * + * - an HDF5 file, as above + * - a VCF file, containing the input variants, with labels and scores appended for those passing variant-type checks TODO + calibration-sensitivity scores + filters applied? 
+ */ +@CommandLineProgramProperties( + // TODO + summary = "", + oneLineSummary = "", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVariantWalker { + + public static final String MODE_LONG_NAME = "mode"; + public static final String USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME = "use-allele-specific-annotations"; + public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter"; + public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters"; + public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic"; + public static final String OMIT_ALLELES_IN_HDF5_LONG_NAME = "omit-alleles-in-hdf5"; + public static final String DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME = "do-not-gzip-vcf-output"; + + public static final String ANNOTATIONS_HDF5_SUFFIX = ".annot.hdf5"; + + public static final String RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING = "This site was labeled as %s according to resources"; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Prefix for output filenames.") + String outputPrefix; + + @Argument( + fullName = StandardArgumentDefinitions.RESOURCE_LONG_NAME, + doc = "Resource VCFs used to label extracted variants.", + optional = true) + private List> resources = new ArrayList<>(10); + + @Argument( + fullName = StandardArgumentDefinitions.ANNOTATION_LONG_NAME, + shortName = StandardArgumentDefinitions.ANNOTATION_SHORT_NAME, + doc = "Names of the annotations to extract. Note that a requested annotation may in fact not be present " + + "at any extraction site; NaN missing values will be generated for such annotations.", + minElements = 1) + List annotationNames = new ArrayList<>(); + + @Argument( + fullName = MODE_LONG_NAME, + doc = "Variant types to extract.", + minElements = 1) + private List variantTypesToExtractList = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL)); + + @Argument( + fullName = USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, + doc = "If true, use the allele-specific versions of the specified annotations.", + optional = true) + boolean useASAnnotations = false; + + @Argument( + fullName = IGNORE_FILTER_LONG_NAME, + doc = "Ignore the specified filter(s) in the input VCF.", + optional = true) + private List ignoreInputFilters = new ArrayList<>(); + + @Argument( + fullName = IGNORE_ALL_FILTERS_LONG_NAME, + doc = "If true, ignore all filters in the input VCF.", + optional = true) + private boolean ignoreAllFilters = false; + + // TODO this is a perhaps vestigial argument inherited from VQSR; its impact and necessity could be reevaluated + @Argument( + fullName = DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME, + doc = "If true, do not trust that unfiltered records in the resources contain only polymorphic sites. 
" + + "This may increase runtime.", + optional = true) + private boolean doNotTrustAllPolymorphic = false; + + @Argument( + fullName = OMIT_ALLELES_IN_HDF5_LONG_NAME, + doc = "If true, omit alleles in output HDF5 files in order to decrease file sizes.", + optional = true + ) + boolean omitAllelesInHDF5 = false; + + @Argument( + fullName = DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME, + doc = "If true, VCF output will not be compressed.", + optional = true + ) + boolean doNotGZIPVCFOutput = false; + + private final Set ignoreInputFilterSet = new TreeSet<>(); + Set variantTypesToExtract; + TreeSet resourceLabels = new TreeSet<>(); + + File outputAnnotationsFile; + VariantContextWriter vcfWriter; + + LabeledVariantAnnotationsData data; + + @Override + public void onTraversalStart() { + + ignoreInputFilterSet.addAll(ignoreInputFilters); + + variantTypesToExtract = EnumSet.copyOf(variantTypesToExtractList); + + outputAnnotationsFile = new File(outputPrefix + ANNOTATIONS_HDF5_SUFFIX); + final String vcfSuffix = doNotGZIPVCFOutput ? ".vcf" : ".vcf.gz"; + final File outputVCFFile = new File(outputPrefix + vcfSuffix); + + // TODO this validation method should perhaps be moved outside of the CNV code + CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotationsFile, outputVCFFile); + + for (final FeatureInput resource : resources) { + final TreeSet trackResourceLabels = resource.getTagAttributes().entrySet().stream() + .filter(e -> e.getValue().equals("true")) + .map(Map.Entry::getKey) + .sorted() + .collect(Collectors.toCollection(TreeSet::new)); + resourceLabels.addAll(trackResourceLabels); + logger.info( String.format("Found %s track: labels = %s", resource.getName(), trackResourceLabels)); + } + resourceLabels.forEach(String::intern); + + if (resourceLabels.contains(LabeledVariantAnnotationsData.SNP_LABEL)) { + throw new UserException.BadInput(String.format("The resource label \"%s\" is reserved for labeling variant types.", + LabeledVariantAnnotationsData.SNP_LABEL)); + } + + data = new LabeledVariantAnnotationsData(annotationNames, resourceLabels, useASAnnotations); + + vcfWriter = createVCFWriter(outputVCFFile); + vcfWriter.writeHeader(constructVCFHeader(data.getSortedLabels())); + + afterOnTraversalStart(); // perform additional validation, set modes in child tools, etc. + } + + public void afterOnTraversalStart() { + // override + } + + @Override + protected int numberOfPasses() { + return 1; + } + + @Override + public Object onTraversalSuccess() { + return null; + } + + // TODO maybe clean up all this Triple and metadata business with a class? + static void addExtractedVariantToData(final LabeledVariantAnnotationsData data, + final VariantContext variant, + final List, VariantType, TreeSet>> metadata) { + data.add(variant, + metadata.stream().map(Triple::getLeft).collect(Collectors.toList()), + metadata.stream().map(Triple::getMiddle).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).collect(Collectors.toList())); + } + + void writeExtractedVariantToVCF(final VariantContext variant, + final List, VariantType, TreeSet>> metadata) { + writeExtractedVariantToVCF(variant, + metadata.stream().map(Triple::getLeft).flatMap(List::stream).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).flatMap(Set::stream).collect(Collectors.toSet())); + } + + void writeAnnotationsToHDF5() { + if (data.size() == 0) { + logger.warn("Found no input variants for extraction. 
This may be because the specified " + + "genomic region contains no input variants of the requested type(s) or, if extracting " + + "training labels, because none of the input variants were contained in the resource VCFs " + + "or no resource VCFs were provided. The annotations HDF5 file will not be generated."); + return; + } + for (final VariantType variantType : variantTypesToExtract) { + logger.info(String.format("Extracted annotations for %d variants of type %s.", + data.getVariantTypeFlat().stream().mapToInt(t -> t == variantType ? 1 : 0).sum(), variantType)); + } + for (final String label : data.getSortedLabels()) { + logger.info(String.format("Extracted annotations for %d variants labeled as %s.", + data.isLabelFlat(label).stream().mapToInt(b -> b ? 1 : 0).sum(), label)); + } + logger.info(String.format("Extracted annotations for %s total variants.", data.size())); + + logger.info("Writing annotations..."); + data.writeHDF5(outputAnnotationsFile, omitAllelesInHDF5); + logger.info(String.format("Annotations and metadata written to %s.", outputAnnotationsFile.getAbsolutePath())); + } + + /** + * Writes a sites-only VCF containing the extracted variants and corresponding labels. + */ + void writeExtractedVariantToVCF(final VariantContext vc, + final List altAlleles, + final Set labels) { + final List alleles = ListUtils.union(Collections.singletonList(vc.getReference()), altAlleles); + final VariantContextBuilder builder = new VariantContextBuilder( + vc.getSource(), vc.getContig(), vc.getStart(), vc.getEnd(), alleles); + labels.forEach(l -> builder.attribute(l, true)); // labels should already be sorted as a TreeSet + vcfWriter.add(builder.make()); + } + + // modified from VQSR code + // TODO we're just writing a standard sites-only VCF here, maybe there's a nicer way to do this? + VCFHeader constructVCFHeader(final List sortedLabels) { + Set hInfo = getDefaultToolVCFHeaderLines(); + hInfo.addAll(sortedLabels.stream() + .map(l -> new VCFInfoHeaderLine(l, 1, VCFHeaderLineType.Flag, String.format(RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING, l))) + .collect(Collectors.toList())); + hInfo.add(GATKVCFHeaderLines.getFilterLine(VCFConstants.PASSES_FILTERS_v4)); + final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary(); + hInfo = VcfUtils.updateHeaderContigLines(hInfo, null, sequenceDictionary, true); + return new VCFHeader(hInfo); + } + + /** + * Performs variant-filter and variant-type checks to determine variants/alleles suitable for extraction, and returns + * a corresponding list of metadata. This method should not be overridden, as it is intended to enforce identical + * variant-extraction behavior in all child tools. Logic here and below for filtering and determining variant type + * was retained from VQSR, but has been heavily refactored. 
+ */ + final List, VariantType, TreeSet>> extractVariantMetadata(final VariantContext vc, + final FeatureContext featureContext, + final boolean isExtractUnlabeled) { + // if variant is filtered, do not consume here + if (vc == null || !(ignoreAllFilters || vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()))) { + return Collections.emptyList(); + } + if (!useASAnnotations) { + // in non-allele-specific mode, get a singleton list of the triple + // (list of alt alleles passing variant-type and overlapping-resource checks, variant type, set of labels) + final VariantType variantType = VariantType.getVariantType(vc); + if (variantTypesToExtract.contains(variantType)) { + final TreeSet overlappingResourceLabels = findOverlappingResourceLabels(vc, null, null, featureContext); + if (isExtractUnlabeled || !overlappingResourceLabels.isEmpty()) { + return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, overlappingResourceLabels)); + } + } + } else { + // in allele-specific mode, get a list containing the triples + // (singleton list of alt allele, variant type, set of labels) + // corresponding to alt alleles that pass variant-type and overlapping-resource checks + return vc.getAlternateAlleles().stream() + .filter(a -> !GATKVCFConstants.isSpanningDeletion(a)) + .filter(a -> variantTypesToExtract.contains(VariantType.getVariantType(vc, a))) + .map(a -> Triple.of(Collections.singletonList(a), VariantType.getVariantType(vc, a), + findOverlappingResourceLabels(vc, vc.getReference(), a, featureContext))) + .filter(t -> isExtractUnlabeled || !t.getRight().isEmpty()) + .collect(Collectors.toList()); + } + // if variant-type and overlapping-resource checks failed, return an empty list + return Collections.emptyList(); + } + + private TreeSet findOverlappingResourceLabels(final VariantContext vc, + final Allele refAllele, + final Allele altAllele, + final FeatureContext featureContext) { + final TreeSet overlappingResourceLabels = new TreeSet<>(); + for (final FeatureInput resource : resources) { + final List resourceVCs = featureContext.getValues(resource, featureContext.getInterval().getStart()); + for (final VariantContext resourceVC : resourceVCs) { + if (useASAnnotations && !doAllelesMatch(refAllele, altAllele, resourceVC)) { + continue; + } + if (isValidVariant(vc, resourceVC, !doNotTrustAllPolymorphic)) { + resource.getTagAttributes().entrySet().stream() + .filter(e -> e.getValue().equals("true")) + .map(Map.Entry::getKey) + .forEach(overlappingResourceLabels::add); + } + } + } + return overlappingResourceLabels; + } + + private static boolean isValidVariant(final VariantContext vc, + final VariantContext resourceVC, + final boolean trustAllPolymorphic) { + return resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) && + (trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples()); + } + + private static boolean doAllelesMatch(final Allele refAllele, + final Allele altAllele, + final VariantContext resourceVC) { + if (altAllele == null) { + return true; + } + try { + return GATKVariantContextUtils.isAlleleInList(refAllele, altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles()); + } catch (final IllegalStateException e) { + throw new IllegalStateException("Reference allele mismatch at position " + resourceVC.getContig() + ':' + resourceVC.getStart() + " : ", e); + } + } +} \ No newline at end of file diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java new file mode 100644 index 00000000000..33fefe62ad1 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java @@ -0,0 +1,624 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.primitives.Doubles; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.lang3.tuple.Triple; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Scores variant calls in a VCF file based on site-level annotations using a previously trained model. + * + *

+ * This tool is intended to be used as the last step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. Using a previously trained model produced by {@link TrainVariantAnnotationsModel}, + * this tool assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact). + * Each score can also be converted to a corresponding sensitivity to a calibration set, if the latter is available. + * Each VCF record can also be annotated with additional resource labels and/or hard filtered based on its + * calibration-set sensitivity, if desired. + *

+ * + *

+ * Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files + * upon completion of the traversal. Memory requirements thus roughly scale linearly with both the number of sites + * scored and the number of annotations. For large callsets, this tool may be run in parallel over separate + * genomic shards using the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument as usual. + *

+ * + *

+ * Scores and annotations are also output to HDF5 files, which may be viewed using + * hdfview or loaded in Python using + * PyTables or h5py. + *
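A rough sketch of how the score output and the calibration-set sensitivity conversion fit together (file names and dataset paths below are illustrative assumptions, and the conversion shown is only the interpretation described above, not necessarily the tool's exact implementation):

    import h5py
    import numpy as np

    # Per-site scores written by this tool; inspect the file for the actual dataset path.
    with h5py.File("output.scores.hdf5", "r") as f:
        scores = f["/data/scores"][()]                        # assumed path
    # Scores of the calibration-set sites, e.g. taken from the model files if available.
    with h5py.File("calibration.scores.hdf5", "r") as f:      # illustrative file name
        calibration_scores = np.sort(f["/data/scores"][()])   # assumed path

    # Calibration sensitivity of a score = fraction of calibration-set scores at or above it,
    # so lower scores map to sensitivities closer to 1.
    idx = np.searchsorted(calibration_scores, scores, side="left")
    calibration_sensitivity = 1.0 - idx / len(calibration_scores)

    # A record would be hard filtered when its sensitivity meets or exceeds the chosen threshold.
    is_filtered = calibration_sensitivity >= 0.99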

+ * + *

+ * <h3>Inputs</h3>

+ * + *
    + *
  • + * Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles, + * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified). + *
  • + *
  • + * Annotations to use for scoring. These should be identical to those used in the {@link ExtractVariantAnnotations} + * step to create the training set. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) to score. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. To use different models for SNPs and INDELs + * (e.g., if it is desired to use different sets of annotations for each variant type), one can first run + * this tool to score SNPs and then again on the resulting output to score INDELs. + *
  • + *
  • + * Model prefix. This should denote the path of model files produced by {@link TrainVariantAnnotationsModel}. + *
  • + *
  • + * (Optional) Model backend. This should be identical to that specified in {@link TrainVariantAnnotationsModel}. + * The default Python IsolationForest implementation requires either the GATK Python environment + * or that certain Python packages (argparse, h5py, numpy, sklearn, and dill) are otherwise available. + * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument. + *
  • + *
  • + * (Optional) Resource VCF file(s). See the corresponding documentation in {@link ExtractVariantAnnotations}. + * In typical usage, the same resource VCFs and tags provided to that tool should also be provided here. + * In addition, the sites-only VCF that is produced by that tool can also be provided here and used to + * mark those labeled sites that were extracted, which can be useful if these are a subset of the resource sites. + *
  • + *
  • + * (Optional) Calibration-set sensitivity thresholds for SNPs and INDELs. If the corresponding SNP or INDEL + * calibration-set scores are available in the provided model files, sites that have a calibration-set + * sensitivity falling above the corresponding threshold (i.e., a score falling below the corresponding + * score threshold) will have a filter applied. + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

+ * <h3>Outputs</h3>

+ * + *
    + *
  • + * Scored VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME} + * argument is set to true. The INFO field in each VCF record will be annotated with: + * + *

    + * 1) a score (with a key as given by the {@value SCORE_KEY_LONG_NAME} argument, + * which has a default value of {@value DEFAULT_SCORE_KEY}), + *

    + *

    + * 2) if resources are provided, flags corresponding to the labels (e.g., + * {@value LabeledVariantAnnotationsData#TRAINING_LABEL}, {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL}, etc.) + * of resources containing the record, + *

    + *

    + * 3) if the {@value SNP_KEY_LONG_NAME} argument (which has a default value of {@value DEFAULT_SNP_KEY}) + * is non-null, a flag corresponding to whether a site is treated as a SNP, + *

    + *

    + * 4) if {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and/or + * {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} are provided, a filter (with name given by + * the {@value LOW_SCORE_FILTER_NAME_LONG_NAME} argument, which has a default value of + * {@value DEFAULT_LOW_SCORE_FILTER_NAME}) will be applied if a record has a calibration-set sensitivity + * falling above the appropriate threshold (i.e., if it has a score falling below the corresponding + * score threshold). + *

    + *

    + * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is true, the score, SNP flag, calibration sensitivity, + * and filter appropriate for the highest scoring allele are used; however, the resource labels for all alleles + * are applied. + *

    + * + *
  • + *
  • + * (Optional) Annotations HDF5 file (.annot.hdf5). Annotation data and metadata for all scored sites + * (labeled and unlabeled) are stored in the HDF5 directory structure given in the documentation for the + * {@link ExtractVariantAnnotations} tool. This file will only be produced if the number of scored sites + * is nonzero. + *

    + * + *
  • + *
  • + * (Optional) Scores HDF5 file (.scores.hdf5). Scores for all scored sites are stored in the + * HDF5 path {@value VariantAnnotationsScorer#SCORES_PATH}. Scores are given in the same order as records + * in both the VCF and the annotations HDF5 file. This file will only be produced if the number of scored sites + * is nonzero. + *

    + *
  • + *
+ * + *

+ * <h3>Usage examples</h3>

+ * + *

+ * Score sites using a model (produced by {@link TrainVariantAnnotationsModel} using the default + * {@link VariantAnnotationsModelBackend#PYTHON_IFOREST} model backend and contained in the directory + * {@code model_dir}), producing the outputs 1) {@code output.vcf.gz}, 2) {@code output.vcf.gz.tbi}, + * 3) {@code output.annot.hdf5}, and 4) {@code output.scores.hdf5}. Note that {@code extract.vcf.gz} is + * produced by {@link ExtractVariantAnnotations}. Records will be filtered according to the values provided to the + * {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} + * arguments; the values below are only meant to be illustrative and should be set as appropriate for a given analysis. + * + *

+ *     gatk ScoreVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --model-prefix model_dir \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --resource:extracted,extracted=true extract.vcf.gz \
+ *          --snp-calibration-sensitivity-threshold 0.99 \
+ *          --indel-calibration-sensitivity-threshold 0.99 \
+ *          -O output
+ * 
+ * + *

+ * One may chain together two runs of this tool to score SNPs and INDELs using different models + * (note that SNP and INDEL models have "snp" and "indel" tags in their respective filenames, so these + * models can still be contained in the same {@code model_dir} directory). + * This may have implications for mixed SNP/INDEL sites, especially if filters are applied; see also the + * {@value IGNORE_ALL_FILTERS_LONG_NAME} and {@value IGNORE_FILTER_LONG_NAME} arguments. + * + *

+ *     gatk ScoreVariantAnnotations \
+ *          -V input.vcf \
+ *          -A snp_annotation_1 \
+ *          ...
+ *          -A snp_annotation_N \
+ *          --model-prefix model_dir \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --resource:extracted,extracted=true snp-extract.vcf.gz \
+ *          --snp-calibration-sensitivity-threshold 0.99 \
+ *          -O intermediate-output
+ *
+ *     gatk ScoreVariantAnnotations \
+ *          -V intermediate-output.vcf \
+ *          -A indel_annotation_1 \
+ *          ...
+ *          -A indel_annotation_M \
+ *          --model-prefix model_dir \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --resource:extracted,extracted=true indel-extract.vcf.gz \
+ *          --indel-calibration-sensitivity-threshold 0.99 \
+ *          -O output
+ * 
+ * + *

+ * <h3>Custom modeling/scoring backends (ADVANCED)</h3>

+ * + *

+ * The primary scoring functionality performed by this tool is accomplished by a "scoring backend" + * whose fundamental contract is to take an input annotation matrix and to output corresponding scores, + * with both input and output given as HDF5 files. Rather than using one of the available, implemented backends, + * advanced users may provide their own backend via the {@value PYTHON_SCRIPT_LONG_NAME} argument. + * See documentation in the modeling and scoring interfaces ({@link VariantAnnotationsModel} and + * {@link VariantAnnotationsScorer}, respectively), as well as the default Python IsolationForest implementation at + * org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py. + *
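As a very rough sketch of what such a custom backend script could look like (the argument names and HDF5 dataset paths below are assumptions made for illustration; the actual contract is defined by the {@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer} interfaces and the bundled isolation-forest.py):

    import argparse

    import h5py
    import numpy as np

    # Hypothetical custom scoring backend: read an annotation matrix, score each row,
    # and write the scores under an output prefix.
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotations_file", required=True)   # assumed argument name
    parser.add_argument("--output_prefix", required=True)      # assumed argument name
    args = parser.parse_args()

    with h5py.File(args.annotations_file, "r") as f:
        annotations = f["/data/annotations"][()]               # assumed path: one row per site/allele

    # Toy scoring rule (stand-in for a real model): negative distance from the per-annotation median,
    # so more typical sites receive higher scores.
    scores = -np.sum(np.abs(annotations - np.median(annotations, axis=0)), axis=1)

    with h5py.File(args.output_prefix + ".scores.hdf5", "w") as f:
        f.create_dataset("/data/scores", data=scores)          # assumed output path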

+ * + * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Scores variant calls in a VCF file based on site-level annotations using a previously trained model.", + oneLineSummary = "Scores variant calls in a VCF file based on site-level annotations using a previously trained model", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public class ScoreVariantAnnotations extends LabeledVariantAnnotationsWalker { + + public static final String MODEL_PREFIX_LONG_NAME = "model-prefix"; + public static final String MODEL_BACKEND_LONG_NAME = TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME; + public static final String PYTHON_SCRIPT_LONG_NAME = "python-script"; + public static final String SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "snp-calibration-sensitivity-threshold"; + public static final String INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "indel-calibration-sensitivity-threshold"; + + public static final String SNP_KEY_LONG_NAME = "snp-key"; + public static final String SCORE_KEY_LONG_NAME = "score-key"; + public static final String CALIBRATION_SENSITIVITY_KEY_LONG_NAME = "calibration-sensitivity-key"; + public static final String LOW_SCORE_FILTER_NAME_LONG_NAME = "low-score-filter-name"; + public static final String DOUBLE_FORMAT_LONG_NAME = "double-format"; + + public static final String DEFAULT_SNP_KEY = LabeledVariantAnnotationsData.SNP_LABEL; + public static final String DEFAULT_SCORE_KEY = "SCORE"; + public static final String DEFAULT_CALIBRATION_SENSITIVITY_KEY = "CALIBRATION_SENSITIVITY"; + public static final String DEFAULT_LOW_SCORE_FILTER_NAME = "LOW_SCORE"; + public static final String DEFAULT_DOUBLE_FORMAT = "%.4f"; + + public static final String SCORES_HDF5_SUFFIX = ".scores.hdf5"; + + @Argument( + fullName = MODEL_PREFIX_LONG_NAME) + private String modelPrefix; + + @Argument( + fullName = MODEL_BACKEND_LONG_NAME, + doc = "Backend to use for scoring. " + + "JAVA_BGMM will use a pure Java implementation (ported from Python scikit-learn) of the Bayesian Gaussian Mixture Model. " + + "PYTHON_IFOREST will use the Python scikit-learn implementation of the IsolationForest method and " + + "will require that the corresponding Python dependencies are present in the environment. " + + "PYTHON_SCRIPT will use the script specified by the " + PYTHON_SCRIPT_LONG_NAME + " argument. " + + "See the tool documentation for more details." ) + private VariantAnnotationsModelBackend modelBackend = VariantAnnotationsModelBackend.PYTHON_IFOREST; + + @Argument( + fullName = PYTHON_SCRIPT_LONG_NAME, + doc = "Python script used for specifying a custom scoring backend. If provided, " + MODEL_BACKEND_LONG_NAME + " must also be set to PYTHON_SCRIPT.", + optional = true) + private File pythonScriptFile; + + @Argument( + fullName = SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "If specified, SNPs with scores corresponding to a calibration sensitivity that is greater than or equal to this threshold will be hard filtered.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double snpCalibrationSensitivityThreshold; + + @Argument( + fullName = INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "If specified, indels with scores corresponding to a calibration sensitivity that is greater than or equal to this threshold will be hard filtered.", + optional = true, + minValue = 0., + maxValue = 1.) 
+ private Double indelCalibrationSensitivityThreshold; + + @Argument( + fullName = SNP_KEY_LONG_NAME, + doc = "Annotation flag to use for labeling sites as SNPs in output. " + + "Set this to \"null\" to omit these labels.") + private String snpKey = DEFAULT_SNP_KEY; + + @Argument( + fullName = SCORE_KEY_LONG_NAME, + doc = "Annotation key to use for score values in output.") + private String scoreKey = DEFAULT_SCORE_KEY; + + @Argument( + fullName = CALIBRATION_SENSITIVITY_KEY_LONG_NAME, + doc = "Annotation key to use for calibration-sensitivity values in output.") + private String calibrationSensitivityKey = DEFAULT_CALIBRATION_SENSITIVITY_KEY; + + @Argument( + fullName = LOW_SCORE_FILTER_NAME_LONG_NAME, + doc = "Name to use for low-score filter in output.") + private String lowScoreFilterName = DEFAULT_LOW_SCORE_FILTER_NAME; + + @Argument( + fullName = DOUBLE_FORMAT_LONG_NAME, + doc = "Format string to use for formatting score and calibration-sensitivity values in output.") + private String doubleFormat = DEFAULT_DOUBLE_FORMAT; + + private File outputScoresFile; + private Iterator scoresIterator; + private Iterator isSNPIterator; + + private VariantAnnotationsScorer snpScorer; + private VariantAnnotationsScorer indelScorer; + + private Function snpCalibrationSensitivityConverter; + private Function indelCalibrationSensitivityConverter; + + @Override + protected int numberOfPasses() { + return 2; + } + + @Override + public void afterOnTraversalStart() { + + Utils.nonNull(scoreKey); + Utils.nonNull(calibrationSensitivityKey); + Utils.nonNull(lowScoreFilterName); + Utils.nonNull(doubleFormat); + + switch (modelBackend) { + case JAVA_BGMM: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using JAVA_BGMM backend."); + logger.info("Running in JAVA_BGMM mode..."); + snpScorer = deserializeScorerFromSerFiles(VariantType.SNP); + indelScorer = deserializeScorerFromSerFiles(VariantType.INDEL); + break; + case PYTHON_IFOREST: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using PYTHON_IFOREST backend."); + + pythonScriptFile = IOUtils.writeTempResource(new Resource(TrainVariantAnnotationsModel.ISOLATION_FOREST_PYTHON_SCRIPT, TrainVariantAnnotationsModel.class)); + PythonScriptExecutor.checkPythonEnvironmentForPackage("argparse"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("h5py"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("numpy"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("sklearn"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("dill"); + logger.info("Running in PYTHON_IFOREST mode..."); + snpScorer = deserializeScorerFromPklFiles(VariantType.SNP); + indelScorer = deserializeScorerFromPklFiles(VariantType.INDEL); + break; + case PYTHON_SCRIPT: + IOUtils.canReadFile(pythonScriptFile); + logger.info("Running in PYTHON_SCRIPT mode..."); + snpScorer = deserializeScorerFromPklFiles(VariantType.SNP); + indelScorer = deserializeScorerFromPklFiles(VariantType.INDEL); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model-backend mode."); + } + + if (snpScorer == null && indelScorer == null) { + throw new UserException.BadInput(String.format("At least one serialized scorer must be present " + + "in the model files with the prefix %s.", modelPrefix)); + } + if (variantTypesToExtract.contains(VariantType.SNP) && snpScorer == null) { + throw new UserException.BadInput(String.format("SNPs were indicated for extraction via the %s argument, " + 
+ "but no serialized SNP scorer was available in the model files with the prefix.", MODE_LONG_NAME, modelPrefix)); + } + if (variantTypesToExtract.contains(VariantType.INDEL) && indelScorer == null) { + throw new UserException.BadInput(String.format("INDELs were indicated for extraction via the %s argument, " + + "but no serialized INDEL scorer was available in the model files with the prefix.", MODE_LONG_NAME, modelPrefix)); + } + + snpCalibrationSensitivityConverter = readCalibrationScoresAndCreateConverter(VariantType.SNP); + indelCalibrationSensitivityConverter = readCalibrationScoresAndCreateConverter(VariantType.INDEL); + + if (snpCalibrationSensitivityConverter == null && snpCalibrationSensitivityThreshold != null) { + throw new UserException.BadInput(String.format("The %s argument was specified, " + + "but no SNP calibration scores were provided in the model files with the prefix %s.", + SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, modelPrefix)); + } + if (indelCalibrationSensitivityConverter == null && indelCalibrationSensitivityThreshold != null) { + throw new UserException.BadInput(String.format("The %s argument was specified, " + + "but no INDEL calibration scores were provided in the model files with the prefix %s.", + INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, modelPrefix)); + } + + outputScoresFile = new File(outputPrefix + SCORES_HDF5_SUFFIX); + + // TODO this validation method should perhaps be moved outside of the CNV code + CopyNumberArgumentValidationUtils.validateOutputFiles(outputScoresFile); + } + + @Override + protected void nthPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext, + final int n) { + final List, VariantType, TreeSet>> metadata = extractVariantMetadata(variant, featureContext, true); + final boolean isVariantExtracted = !metadata.isEmpty(); + if (n == 0 && isVariantExtracted) { + addExtractedVariantToData(data, variant, metadata); + } + if (n == 1) { + if (isVariantExtracted) { + writeExtractedVariantToVCF(variant, metadata); + } else { + vcfWriter.add(variant); + } + } + } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + // TODO if BGMM, preprocess annotations and write to HDF5 with BGMMVariantAnnotationsScorer.preprocessAnnotationsWithBGMMAndWriteHDF5 + writeAnnotationsToHDF5(); + if (data.size() > 0) { + data.clear(); + readAnnotationsAndWriteScoresToHDF5(); + scoresIterator = Arrays.stream(VariantAnnotationsScorer.readScores(outputScoresFile)).iterator(); + isSNPIterator = LabeledVariantAnnotationsData.readLabel(outputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL).iterator(); + } else { + scoresIterator = Collections.emptyIterator(); + isSNPIterator = Collections.emptyIterator(); + } + } + if (n == 1) { + if (scoresIterator.hasNext()) { + throw new IllegalStateException("Traversals of scores and variants " + + "(or alleles, in allele-specific mode) were not correctly synchronized."); + } + if (vcfWriter != null) { + vcfWriter.close(); + } + } + } + + private VariantAnnotationsScorer deserializeScorerFromPklFiles(final VariantType variantType) { + final String variantTypeTag = '.' 
+ variantType.toString().toLowerCase(); + final File scorerPklFile = new File( + modelPrefix + variantTypeTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX); + final File negativeScorerPklFile = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX); + return scorerPklFile.canRead() + ? negativeScorerPklFile.canRead() + ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile), + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, negativeScorerPklFile)) + : new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile) + : null; + } + + private VariantAnnotationsScorer deserializeScorerFromSerFiles(final VariantType variantType) { + final String variantTypeTag = '.' + variantType.toString().toLowerCase(); + final File scorerSerFile = new File( + modelPrefix + variantTypeTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX); + final File negativeScorerSerFile = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX); + return scorerSerFile.canRead() + ? negativeScorerSerFile.canRead() + ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + BGMMVariantAnnotationsScorer.deserialize(scorerSerFile), + BGMMVariantAnnotationsScorer.deserialize(negativeScorerSerFile)) + : BGMMVariantAnnotationsScorer.deserialize(scorerSerFile) + : null; + } + + private Function readCalibrationScoresAndCreateConverter(final VariantType variantType) { + final String variantTypeTag = '.' + variantType.toString().toLowerCase(); + final File calibrationScores = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX); + return calibrationScores.canRead() + ? 
VariantAnnotationsScorer.createScoreToCalibrationSensitivityConverter(VariantAnnotationsScorer.readScores(calibrationScores)) + : null; + } + + private void readAnnotationsAndWriteScoresToHDF5() { + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(outputAnnotationsFile); + final List isSNP = LabeledVariantAnnotationsData.readLabel(outputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL); + final double[][] allAnnotations = LabeledVariantAnnotationsData.readAnnotations(outputAnnotationsFile); + final int numAll = allAnnotations.length; + final List allScores = new ArrayList<>(Collections.nCopies(numAll, Double.NaN)); + if (variantTypesToExtract.contains(VariantType.SNP)) { + logger.info("Scoring SNP variants..."); + scoreVariantTypeAndSetElementsOfAllScores(annotationNames, allAnnotations, isSNP, snpScorer, allScores); + } + if (variantTypesToExtract.contains(VariantType.INDEL)) { + logger.info("Scoring INDEL variants..."); + final List isIndel = isSNP.stream().map(x -> !x).collect(Collectors.toList()); + scoreVariantTypeAndSetElementsOfAllScores(annotationNames, allAnnotations, isIndel, indelScorer, allScores); + } + VariantAnnotationsScorer.writeScores(outputScoresFile, Doubles.toArray(allScores)); + logger.info(String.format("Scores written to %s.", outputScoresFile.getAbsolutePath())); + } + + private static void scoreVariantTypeAndSetElementsOfAllScores(final List annotationNames, + final double[][] allAnnotations, + final List isVariantType, + final VariantAnnotationsScorer variantTypeScorer, + final List allScores) { + final File variantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, allAnnotations, isVariantType); + final File variantTypeScoresFile = IOUtils.createTempFile("temp", ".scores.hdf5"); + variantTypeScorer.score(variantTypeAnnotationsFile, variantTypeScoresFile); // TODO we do not fail until here in the case of mismatched annotation names; we could fail earlier + final double[] variantTypeScores = VariantAnnotationsScorer.readScores(variantTypeScoresFile); + final Iterator variantTypeScoresIterator = Arrays.stream(variantTypeScores).iterator(); + IntStream.range(0, allScores.size()).filter(isVariantType::get).forEach(i -> allScores.set(i, variantTypeScoresIterator.next())); + } + + @Override + void writeExtractedVariantToVCF(final VariantContext vc, + final List altAlleles, + final Set labels) { + final VariantContextBuilder builder = new VariantContextBuilder(vc); + labels.forEach(l -> builder.attribute(l, true)); // labels should already be sorted as a TreeSet + + final List scores = useASAnnotations + ? altAlleles.stream().map(a -> scoresIterator.next()).collect(Collectors.toList()) + : Collections.singletonList(scoresIterator.next()); + final double score = Collections.max(scores); + final int scoreIndex = scores.indexOf(score); + builder.attribute(scoreKey, formatDouble(score)); + + final List isSNP = useASAnnotations + ? altAlleles.stream().map(a -> isSNPIterator.next()).collect(Collectors.toList()) + : Collections.singletonList(isSNPIterator.next()); + final boolean isSNPMax = isSNP.get(scoreIndex); + + if (snpKey != null) { + builder.attribute(snpKey, isSNPMax); + } + + final Function calibrationSensitivityConverter = isSNPMax ? 
snpCalibrationSensitivityConverter : indelCalibrationSensitivityConverter; + if (calibrationSensitivityConverter != null) { + final double calibrationSensitivity = calibrationSensitivityConverter.apply(score); + builder.attribute(calibrationSensitivityKey, formatDouble(calibrationSensitivity)); + final Double calibrationSensitivityThreshold = isSNPMax ? snpCalibrationSensitivityThreshold : indelCalibrationSensitivityThreshold; + if (calibrationSensitivityThreshold != null && calibrationSensitivity >= calibrationSensitivityThreshold) { + builder.filter(lowScoreFilterName); // TODO does this sufficiently cover the desired behavior when dealing with previously filtered sites, etc.? + } + } + + vcfWriter.add(builder.make()); + } + + private String formatDouble(final double x) { + return String.format(doubleFormat, x); + } + + /** + * Copies the header from the input VCF and adds info lines for the score, calibration-sensitivity, and label keys, + * as well as the filter line. + */ + @Override + VCFHeader constructVCFHeader(final List sortedLabels) { + final VCFHeader inputHeader = getHeaderForVariants(); + final Set inputHeaders = inputHeader.getMetaDataInSortedOrder(); + + final Set hInfo = new HashSet<>(inputHeaders); + hInfo.add(new VCFInfoHeaderLine(scoreKey, 1, VCFHeaderLineType.Float, + "Score according to the model applied by ScoreVariantAnnotations")); + hInfo.add(new VCFInfoHeaderLine(calibrationSensitivityKey, 1, VCFHeaderLineType.Float, + String.format("Calibration sensitivity corresponding to the value of %s", scoreKey))); + hInfo.add(new VCFFilterHeaderLine(lowScoreFilterName, "Low score (corresponding to high calibration sensitivity)")); + + hInfo.addAll(getDefaultToolVCFHeaderLines()); + if (snpKey != null) { + hInfo.add(new VCFInfoHeaderLine(snpKey, 1, VCFHeaderLineType.Flag, "This site was considered a SNP during filtering")); + } + hInfo.addAll(sortedLabels.stream() + .map(l -> new VCFInfoHeaderLine(l, 1, VCFHeaderLineType.Flag, String.format(RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING, l))) + .collect(Collectors.toList())); + + return new VCFHeader(hInfo, inputHeader.getGenotypeSamples()); + } + + @Override + public Object onTraversalSuccess() { + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java new file mode 100644 index 00000000000..9a8a1c8b845 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java @@ -0,0 +1,570 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Streams; +import com.google.common.primitives.Doubles; +import org.apache.commons.math3.stat.descriptive.moment.Variance; +import org.apache.commons.math3.stat.descriptive.rank.Percentile; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.GATKException; +import 
org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Trains a model for scoring variant calls based on site-level annotations. + * + *

+ * This tool is intended to be used as the second step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. Given training (and optionally, calibration) sets of site-level annotations + * produced by {@link ExtractVariantAnnotations}, this tool can be used to train a model for scoring variant + * calls. The outputs of the tool are TODO + *

+ * + *

+ * The model trained by this tool can in turn be provided along with a VCF file to the {@link ScoreVariantAnnotations} + * tool, which assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact + * and should perhaps be filtered). Each score can also be converted to a corresponding sensitivity to a + * calibration set, if the latter is available. + *
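To make the score-to-calibration-sensitivity conversion above concrete, here is a brief sketch using the static helpers on VariantAnnotationsScorer as they are used in ScoreVariantAnnotations; the Function<Double, Double> type and the calibration-scores file name are assumptions inferred from this diff.

```java
import java.io.File;
import java.util.function.Function;

import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer;

public final class CalibrationSensitivitySketch {
    public static void main(final String[] args) {
        // Placeholder: calibration scores written by TrainVariantAnnotationsModel (*.calibrationScores.hdf5).
        final File calibrationScoresFile = new File("train.snp.calibrationScores.hdf5");
        final double[] calibrationScores = VariantAnnotationsScorer.readScores(calibrationScoresFile);

        // Assumed to yield a Function<Double, Double>, as suggested by converter.apply(score) in ScoreVariantAnnotations.
        final Function<Double, Double> converter =
                VariantAnnotationsScorer.createScoreToCalibrationSensitivityConverter(calibrationScores);

        // Arbitrary example score; lower scores correspond to higher calibration sensitivity.
        final double exampleScore = -0.1;
        System.out.printf("score %.4f -> calibration sensitivity %.4f%n", exampleScore, converter.apply(exampleScore));
    }
}
```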

+ * + *

+ * TODO model definition + *

+ * + *

+ * TODO calibration-sensitivity conversion, considerations, and comparison to tranche files + *

+ * + *

+ * TODO positive vs. positive-negative + *

+ * + *

+ * TODO IsolationForest section with description of method and hyperparameters + *

+ * + *

+ * Note that HDF5 files may be viewed using hdfview + * or loaded in Python using PyTables or h5py. + *
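In addition to the Python options just mentioned, the HDF5 outputs can be inspected from Java with the org.broadinstitute.hdf5.HDF5File wrapper used throughout this changeset. A minimal sketch, assuming the annotation-file paths defined in LabeledVariantAnnotationsData (e.g., /annotations/names and /labels/snp) and a placeholder file name:

```java
import java.io.File;
import java.util.Arrays;

import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;

public final class InspectHdf5Sketch {
    public static void main(final String[] args) {
        final File annotationsFile = new File("extract.annot.hdf5");  // placeholder path
        try (final HDF5File hdf5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) {
            final String[] annotationNames = hdf5File.readStringArray("/annotations/names");
            final double[][] annotations = HDF5Utils.readChunkedDoubleMatrix(hdf5File, "/annotations");
            final double[] isSNP = hdf5File.readDoubleArray("/labels/snp");  // 1 = SNP, 0 = non-SNP
            System.out.printf("%d sites x %d annotations: %s%n",
                    annotations.length, annotationNames.length, Arrays.toString(annotationNames));
            System.out.printf("SNP sites: %d%n", Arrays.stream(isSNP).filter(x -> x == 1).count());
        }
    }
}
```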

+ * + *

Inputs

+ * + *
    + *
  • + * Labeled-annotations HDF5 file (.annot.hdf5). Annotation data and metadata for labeled sites are stored in the + * HDF5 directory structure given in the documentation for the {@link ExtractVariantAnnotations} tool. In typical + * usage, both the {@value LabeledVariantAnnotationsData#TRAINING_LABEL} and + * {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels would be available for non-empty sets of + * sites of the requested variant type. + *
  • + *
  • + * (Optional) Unlabeled-annotations HDF5 file (.unlabeled.annot.hdf5). Annotation data and metadata for + * unlabeled sites are stored in the HDF5 directory structure given in the documentation for the + * {@link ExtractVariantAnnotations} tool. If provided, a positive-negative modeling approach (similar to + * that used in {@link VariantRecalibrator}) will be used. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) for which to train models. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. A separate model will be trained for each variant type + * and separate sets of outputs with corresponding tags in the filenames (i.e., "snp" or "indel") will be produced. + * TODO can run tool twice + *
  • + *
  • + * (Optional) Model backend. The default Python IsolationForest implementation requires either the GATK Python environment + * or that certain Python packages (argparse, h5py, numpy, sklearn, and dill) are otherwise available. + * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument. + *
  • + *
  • + * (Optional) Model hyperparameters JSON file. TODO + *
  • + *
  • + * (Optional) Calibration-set sensitivity threshold. TODO if separate SNP/INDEL thresholds, run tool twice + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *
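As a quick sanity check on the labeled (and unlabeled) annotation inputs listed above, the files can be read back with the static helpers that this changeset adds to LabeledVariantAnnotationsData. A sketch with a placeholder path; the List element types are inferred from the reader implementations later in this diff.

```java
import java.io.File;
import java.util.List;

import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;

public final class CheckLabeledAnnotationsSketch {
    public static void main(final String[] args) {
        final File annotationsFile = new File("extract.annot.hdf5");  // placeholder path

        final List<String> annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(annotationsFile);
        final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(annotationsFile);
        final List<Boolean> isTraining =
                LabeledVariantAnnotationsData.readLabel(annotationsFile, LabeledVariantAnnotationsData.TRAINING_LABEL);
        final List<Boolean> isCalibration =
                LabeledVariantAnnotationsData.readLabel(annotationsFile, LabeledVariantAnnotationsData.CALIBRATION_LABEL);

        System.out.printf("%d sites x %d annotations %s%n",
                annotations.length, annotationNames.size(), annotationNames);
        System.out.printf("training sites: %d, calibration sites: %d%n",
                isTraining.stream().filter(x -> x).count(),
                isCalibration.stream().filter(x -> x).count());
    }
}
```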

Outputs

+ * + *
    + *
  • + * TODO + *
  • + *
  • + * (Optional) TODO + *
  • + *
+ * + *
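The output list above is still marked TODO in this PR; the sketch below is not an authoritative list. It only spells out the per-variant-type score files implied by the suffix constants defined in this tool, and it omits the serialized scorer files written by the individual backends, whose suffixes are backend-specific.

```java
import java.io.File;

import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.TrainVariantAnnotationsModel;

public final class ExpectedScoreOutputsSketch {
    public static void main(final String[] args) {
        final String outputPrefix = "train";  // placeholder value of the -O/--output argument
        final String snpTag = ".snp";         // the tool appends ".snp" or ".indel" per trained variant type

        // e.g., train.snp.trainingScores.hdf5
        final File trainingScores = new File(outputPrefix + snpTag + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX);
        // e.g., train.snp.calibrationScores.hdf5 (written only if calibration sites are present)
        final File calibrationScores = new File(outputPrefix + snpTag + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX);
        // e.g., train.snp.unlabeledScores.hdf5 (written only in the positive-unlabeled approach)
        final File unlabeledScores = new File(outputPrefix + snpTag + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX);

        System.out.printf("%s%n%s%n%s%n", trainingScores, calibrationScores, unlabeledScores);
    }
}
```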

Usage examples

+ * + *

+ * TODO, positive-only, producing the outputs 1) + * + *

+ *     gatk TrainVariantAnnotationsModel \
+ *          TODO
+ * 
+ *

+ * + *

+ * TODO, positive-negative, producing the outputs 1) + * + *

+ *     gatk TrainVariantAnnotationsModel \
+ *          TODO
+ * 
+ *

+ * + *

Custom modeling/scoring backends (ADVANCED)

+ * + *

+ * The primary modeling functionality performed by this tool is accomplished by a "modeling backend" + * whose fundamental contract is to take an input HDF5 file containing an annotation matrix for sites of a + * single variant type (i.e., SNP or INDEL) and to output a serialized scorer for that variant type. + * Rather than using one of the available, implemented backends, advanced users may provide their own backend + * via the {@value PYTHON_SCRIPT_LONG_NAME} argument. See documentation in the modeling and scoring interfaces + * ({@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}, respectively), as well as the default + * Python IsolationForest implementation at org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py. + *
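A minimal sketch of how a custom PYTHON_SCRIPT backend is driven through the modeling interface, mirroring the calls this tool makes; the script, hyperparameter, and annotation paths are placeholders, and the trainAndSerialize signature is inferred from its usage in this diff.

```java
import java.io.File;

import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsModel;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel;

public final class CustomModelingBackendSketch {
    public static void main(final String[] args) {
        // Placeholders: a user-supplied backend script must honor the same contract as isolation-forest.py,
        // i.e., read an annotation-matrix HDF5 for a single variant type and serialize a scorer.
        final File customBackendScript = new File("my-custom-backend.py");
        final File hyperparametersJson = new File("my-hyperparameters.json");
        final File trainingAnnotationsHdf5 = new File("training.snp.annot.hdf5");

        final VariantAnnotationsModel model =
                new PythonSklearnVariantAnnotationsModel(customBackendScript, hyperparametersJson);
        model.trainAndSerialize(trainingAnnotationsHdf5, "train.snp");  // (annotations HDF5, output prefix)
    }
}
```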

+ * + *

+ * Extremely advanced users could potentially substitute their own implementation for the entire + * {@link TrainVariantAnnotationsModel} tool, while still making use of the up/downstream + * {@link ExtractVariantAnnotations} and {@link ScoreVariantAnnotations} tools. To do so, one would additionally + * have to implement functionality for subsetting training/calibration sets by variant type, + * calling modeling backends as appropriate, and scoring calibration sets. + *
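For the substitution described above, the per-variant-type subsetting of training sites could be reproduced outside the tool with the helpers added in this changeset; a sketch under the assumption of a placeholder input path:

```java
import java.io.File;
import java.util.List;
import java.util.stream.Collectors;

import com.google.common.collect.Streams;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;

public final class SubsetTrainingSnpsSketch {
    public static void main(final String[] args) {
        final File annotationsFile = new File("extract.annot.hdf5");  // placeholder path

        final List<String> annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(annotationsFile);
        final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(annotationsFile);
        final List<Boolean> isTraining =
                LabeledVariantAnnotationsData.readLabel(annotationsFile, LabeledVariantAnnotationsData.TRAINING_LABEL);
        final List<Boolean> isSNP =
                LabeledVariantAnnotationsData.readLabel(annotationsFile, LabeledVariantAnnotationsData.SNP_LABEL);

        // Subset to training SNPs, mirroring the per-variant-type subsetting performed by this tool.
        final List<Boolean> isTrainingSnp =
                Streams.zip(isTraining.stream(), isSNP.stream(), (a, b) -> a && b).collect(Collectors.toList());
        final File trainingSnpAnnotationsFile =
                LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isTrainingSnp);
        System.out.println("Training-SNP annotations written to " + trainingSnpAnnotationsFile.getAbsolutePath());
    }
}
```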

+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Trains a model for scoring variant calls based on site-level annotations.", + oneLineSummary = "Trains a model for scoring variant calls based on site-level annotations", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class TrainVariantAnnotationsModel extends CommandLineProgram { + + public static final String MODE_LONG_NAME = "mode"; + public static final String ANNOTATIONS_HDF5_LONG_NAME = "annotations-hdf5"; + public static final String UNLABELED_ANNOTATIONS_HDF5_LONG_NAME = "unlabeled-annotations-hdf5"; + public static final String MODEL_BACKEND_LONG_NAME = "model-backend"; + public static final String PYTHON_SCRIPT_LONG_NAME = "python-script"; + public static final String HYPERPARAMETERS_JSON_LONG_NAME = "hyperparameters-json"; + public static final String CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "calibration-sensitivity-threshold"; + + public static final String ISOLATION_FOREST_PYTHON_SCRIPT = "isolation-forest.py"; + public static final String ISOLATION_FOREST_HYPERPARAMETERS_JSON = "isolation-forest-hyperparameters.json"; + + enum AvailableLabelsMode { + POSITIVE_ONLY, POSITIVE_UNLABELED + } + + public static final String TRAINING_SCORES_HDF5_SUFFIX = ".trainingScores.hdf5"; + public static final String CALIBRATION_SCORES_HDF5_SUFFIX = ".calibrationScores.hdf5"; + public static final String UNLABELED_SCORES_HDF5_SUFFIX = ".unlabeledScores.hdf5"; + public static final String NEGATIVE_TAG = ".negative"; + + @Argument( + fullName = ANNOTATIONS_HDF5_LONG_NAME, + doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations.") + private File inputAnnotationsFile; + + @Argument( + fullName = UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, + doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations. " + + "If specified with " + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME + ", " + + "a positive-unlabeled modeling approach will be used; otherwise, a positive-only modeling " + + "approach will be used.", + optional = true) + private File inputUnlabeledAnnotationsFile; + + @Argument( + fullName = MODEL_BACKEND_LONG_NAME, + doc = "Backend to use for training models. " + + "JAVA_BGMM will use a pure Java implementation (ported from Python scikit-learn) of the Bayesian Gaussian Mixture Model. " + + "PYTHON_IFOREST will use the Python scikit-learn implementation of the IsolationForest method and " + + "will require that the corresponding Python dependencies are present in the environment. " + + "PYTHON_SCRIPT will use the script specified by the " + PYTHON_SCRIPT_LONG_NAME + " argument. " + + "See the tool documentation for more details.") + private VariantAnnotationsModelBackend modelBackend = VariantAnnotationsModelBackend.PYTHON_IFOREST; + + @Argument( + fullName = PYTHON_SCRIPT_LONG_NAME, + doc = "Python script used for specifying a custom scoring backend. If provided, " + MODEL_BACKEND_LONG_NAME + " must also be set to PYTHON_SCRIPT.", + optional = true) + private File pythonScriptFile; + + @Argument( + fullName = HYPERPARAMETERS_JSON_LONG_NAME, + doc = "JSON file containing hyperparameters. 
Optional if the PYTHON_IFOREST backend is used " + + "(if not specified, a default set of hyperparameters will be used); otherwise required.", + optional = true) + private File hyperparametersJSONFile; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output prefix.") + private String outputPrefix; + + @Argument( + fullName = CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "Calibration-sensitivity threshold that determines which sites will be used for training the negative model " + + "in the positive-unlabeled modeling approach. " + + "Increasing this will decrease the corresponding positive-model score threshold; sites with scores below this score " + + "threshold will be used for training the negative model. Thus, this parameter should typically be chosen to " + + "be close to 1, so that sites that score highly according to the positive model will not be used to train the negative model. " + + "The " + UNLABELED_ANNOTATIONS_HDF5_LONG_NAME + " argument must be specified in conjunction with this argument. " + + "If separate thresholds for SNP and INDEL models are desired, run the tool separately for each mode with its respective threshold.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double calibrationSensitivityThreshold; + + @Argument( + fullName = MODE_LONG_NAME, + doc = "Variant types for which to train models. Duplicate values will be ignored.", + minElements = 1) + public List variantTypes = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL)); + + private AvailableLabelsMode availableLabelsMode; + + @Override + protected Object doWork() { + + validateArgumentsAndSetModes(); + + logger.info("Starting training..."); + + for (final VariantType variantType : VariantType.values()) { // enforces order in which models are trained + if (variantTypes.contains(variantType)) { + doModelingWorkForVariantType(variantType); + } + } + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } + + private void validateArgumentsAndSetModes() { + IOUtils.canReadFile(inputAnnotationsFile); + + Utils.validateArg((inputUnlabeledAnnotationsFile == null) == (calibrationSensitivityThreshold == null), + "Unlabeled annotations and calibration-sensitivity threshold must both be unspecified (for positive-only model training) " + + "or specified (for positive-unlabeled model training)."); + + availableLabelsMode = inputUnlabeledAnnotationsFile != null && calibrationSensitivityThreshold != null + ? 
AvailableLabelsMode.POSITIVE_UNLABELED + : AvailableLabelsMode.POSITIVE_ONLY; + + if (inputUnlabeledAnnotationsFile != null) { + IOUtils.canReadFile(inputUnlabeledAnnotationsFile); + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputAnnotationsFile); + final List unlabeledAnnotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputUnlabeledAnnotationsFile); + Utils.validateArg(annotationNames.equals(unlabeledAnnotationNames), "Annotation names must be identical for positive and unlabeled annotations."); + } + + switch (modelBackend) { + case JAVA_BGMM: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using JAVA_BGMM backend."); + IOUtils.canReadFile(hyperparametersJSONFile); + logger.info("Running in JAVA_BGMM mode..."); + break; + case PYTHON_IFOREST: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using PYTHON_IFOREST backend."); + + pythonScriptFile = IOUtils.writeTempResource(new Resource(ISOLATION_FOREST_PYTHON_SCRIPT, TrainVariantAnnotationsModel.class)); + if (hyperparametersJSONFile == null) { + hyperparametersJSONFile = IOUtils.writeTempResource(new Resource(ISOLATION_FOREST_HYPERPARAMETERS_JSON, TrainVariantAnnotationsModel.class)); + } + IOUtils.canReadFile(hyperparametersJSONFile); + PythonScriptExecutor.checkPythonEnvironmentForPackage("argparse"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("h5py"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("numpy"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("sklearn"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("dill"); + logger.info("Running in PYTHON_IFOREST mode..."); + break; + case PYTHON_SCRIPT: + IOUtils.canReadFile(pythonScriptFile); + IOUtils.canReadFile(hyperparametersJSONFile); + logger.info("Running in PYTHON_SCRIPT mode..."); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model-backend mode."); + } + } + + /** + * TODO + */ + private void doModelingWorkForVariantType(final VariantType variantType) { + // positive model + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputAnnotationsFile); + final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(inputAnnotationsFile); + + final List isTraining = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.TRAINING_LABEL); + final List isCalibration = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.CALIBRATION_LABEL); + final List isSNP = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL); + final List isVariantType = variantType == VariantType.SNP ? isSNP : isSNP.stream().map(x -> !x).collect(Collectors.toList()); + + final List isTrainingAndVariantType = Streams.zip(isTraining.stream(), isVariantType.stream(), (a, b) -> a && b).collect(Collectors.toList()); + final int numTrainingAndVariantType = numPassingFilter(isTrainingAndVariantType); + + final String variantTypeString = variantType.toString(); + final String outputPrefixTag = '.' 
+ variantType.toString().toLowerCase(); + + if (numTrainingAndVariantType > 0) { + logger.info(String.format("Training %s model with %d training sites x %d annotations %s...", + variantTypeString, numTrainingAndVariantType, annotationNames.size(), annotationNames)); + final File labeledTrainingAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isTrainingAndVariantType); + trainAndSerializeModel(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag); + logger.info(String.format("%s model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag)); + + if (modelBackend == VariantAnnotationsModelBackend.JAVA_BGMM) { + BGMMVariantAnnotationsScorer.preprocessAnnotationsWithBGMMAndWriteHDF5( + annotationNames, outputPrefix + outputPrefixTag, labeledTrainingAndVariantTypeAnnotationsFile, logger); + } + + logger.info(String.format("Scoring %d %s training sites...", numTrainingAndVariantType, variantTypeString)); + final File labeledTrainingAndVariantTypeScoresFile = score(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag, TRAINING_SCORES_HDF5_SUFFIX); + logger.info(String.format("%s training scores written to %s.", variantTypeString, labeledTrainingAndVariantTypeScoresFile.getAbsolutePath())); + + final List isLabeledCalibrationAndVariantType = Streams.zip(isCalibration.stream(), isVariantType.stream(), (a, b) -> a && b).collect(Collectors.toList()); + final int numLabeledCalibrationAndVariantType = numPassingFilter(isLabeledCalibrationAndVariantType); + if (numLabeledCalibrationAndVariantType > 0) { + logger.info(String.format("Scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString)); + final File labeledCalibrationAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isLabeledCalibrationAndVariantType); + final File labeledCalibrationAndVariantTypeScoresFile = score(labeledCalibrationAndVariantTypeAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX); + logger.info(String.format("%s calibration scores written to %s.", variantTypeString, labeledCalibrationAndVariantTypeScoresFile.getAbsolutePath())); + } else { + logger.warn(String.format("No %s calibration sites were available.", variantTypeString)); + } + + // negative model + if (availableLabelsMode == AvailableLabelsMode.POSITIVE_UNLABELED) { + final double[][] unlabeledAnnotations = LabeledVariantAnnotationsData.readAnnotations(inputUnlabeledAnnotationsFile); + final List unlabeledIsSNP = LabeledVariantAnnotationsData.readLabel(inputUnlabeledAnnotationsFile, "snp"); + final List isUnlabeledVariantType = variantType == VariantType.SNP ? unlabeledIsSNP : unlabeledIsSNP.stream().map(x -> !x).collect(Collectors.toList()); + + final int numUnlabeledVariantType = numPassingFilter(isUnlabeledVariantType); + + if (numUnlabeledVariantType > 0) { + final File labeledCalibrationAndVariantTypeScoresFile = new File(outputPrefix + outputPrefixTag + CALIBRATION_SCORES_HDF5_SUFFIX); + final double[] labeledCalibrationAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledCalibrationAndVariantTypeScoresFile); + final double scoreThreshold = calibrationSensitivityThreshold == 1. // Percentile requires quantile > 0, so we treat this as a special case + ? Doubles.min(labeledCalibrationAndVariantTypeScores) + : new Percentile(100. * (1. 
- calibrationSensitivityThreshold)).evaluate(labeledCalibrationAndVariantTypeScores); + logger.info(String.format("Using %s score threshold of %.4f corresponding to specified calibration-sensitivity threshold of %.4f ...", + variantTypeString, scoreThreshold, calibrationSensitivityThreshold)); + + final double[] labeledTrainingAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledTrainingAndVariantTypeScoresFile); + final List isNegativeTrainingFromLabeledTrainingAndVariantType = Arrays.stream(labeledTrainingAndVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); + final int numNegativeTrainingFromLabeledTrainingAndVariantType = numPassingFilter(isNegativeTrainingFromLabeledTrainingAndVariantType); + logger.info(String.format("Selected %d labeled %s sites below score threshold of %.4f for negative-model training...", + numNegativeTrainingFromLabeledTrainingAndVariantType, variantTypeString, scoreThreshold)); + + logger.info(String.format("Scoring %d unlabeled %s sites...", numUnlabeledVariantType, variantTypeString)); + final File unlabeledVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isUnlabeledVariantType); + final File unlabeledVariantTypeScoresFile = score(unlabeledVariantTypeAnnotationsFile, outputPrefixTag, UNLABELED_SCORES_HDF5_SUFFIX); + final double[] unlabeledVariantTypeScores = VariantAnnotationsScorer.readScores(unlabeledVariantTypeScoresFile); + final List isNegativeTrainingFromUnlabeledVariantType = Arrays.stream(unlabeledVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); // length matches unlabeledAnnotationsFile + final int numNegativeTrainingFromUnlabeledVariantType = numPassingFilter(isNegativeTrainingFromUnlabeledVariantType); + logger.info(String.format("Selected %d unlabeled %s sites below score threshold of %.4f for negative-model training...", + numNegativeTrainingFromUnlabeledVariantType, variantTypeString, scoreThreshold)); + + final double[][] negativeTrainingAndVariantTypeAnnotations = concatenateLabeledAndUnlabeledNegativeTrainingData( + annotationNames, annotations, unlabeledAnnotations, isNegativeTrainingFromLabeledTrainingAndVariantType, isNegativeTrainingFromUnlabeledVariantType); + final int numNegativeTrainingAndVariantType = negativeTrainingAndVariantTypeAnnotations.length; + final List isNegativeTrainingAndVariantType = Collections.nCopies(numNegativeTrainingAndVariantType, true); + + logger.info(String.format("Training %s negative model with %d negative-training sites x %d annotations %s...", + variantTypeString, numNegativeTrainingAndVariantType, annotationNames.size(), annotationNames)); + final File negativeTrainingAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile( + annotationNames, negativeTrainingAndVariantTypeAnnotations, isNegativeTrainingAndVariantType); + trainAndSerializeModel(negativeTrainingAnnotationsFile, outputPrefixTag + NEGATIVE_TAG); + logger.info(String.format("%s negative model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag + NEGATIVE_TAG)); + + if (numLabeledCalibrationAndVariantType > 0) { + logger.info(String.format("Re-scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString)); + final File labeledCalibrationAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, 
isLabeledCalibrationAndVariantType); + final File labeledCalibrationScoresFile = positiveNegativeScore(labeledCalibrationAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX); + logger.info(String.format("Calibration scores written to %s.", labeledCalibrationScoresFile.getAbsolutePath())); + } + } else { + throw new UserException.BadInput(String.format("Attempted to train %s negative model, " + + "but no suitable sites were found in the provided annotations.", variantTypeString)); + } + } + } else { + throw new UserException.BadInput(String.format("Attempted to train %s model, " + + "but no suitable training sites were found in the provided annotations.", variantTypeString)); + } + } + + private static int numPassingFilter(List isPassing) { + return isPassing.stream().mapToInt(x -> x ? 1 : 0).sum(); + } + + private void trainAndSerializeModel(final File trainingAnnotationsFile, + final String outputPrefixTag) { + readAndValidateTrainingAnnotations(trainingAnnotationsFile, outputPrefixTag); + final VariantAnnotationsModel model; + switch (modelBackend) { + case JAVA_BGMM: + model = new BGMMVariantAnnotationsModel(hyperparametersJSONFile); + break; + case PYTHON_IFOREST: + model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile); + break; + case PYTHON_SCRIPT: + model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + model.trainAndSerialize(trainingAnnotationsFile, outputPrefix + outputPrefixTag); + } + + /** + * When training models on data that has been subset to a given variant type, + * we FAIL if any annotation is completely missing and WARN if any annotation has zero variance. + */ + private void readAndValidateTrainingAnnotations(final File trainingAnnotationsFile, + final String outputPrefixTag) { + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(trainingAnnotationsFile); + final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(trainingAnnotationsFile); + + // these checks are redundant, but we err on the side of robustness + final int numAnnotationNames = annotationNames.size(); + final int numData = annotations.length; + Utils.validateArg(numAnnotationNames > 0, "Number of annotation names must be positive."); + Utils.validateArg(numData > 0, "Number of data points must be positive."); + final int numFeatures = annotations[0].length; + Utils.validateArg(numAnnotationNames == numFeatures, + "Number of annotation names must match the number of features in the annotation data."); + + final List completelyMissingAnnotationNames = new ArrayList<>(numFeatures); + IntStream.range(0, numFeatures).forEach( + i -> { + if (new Variance().evaluate(IntStream.range(0, numData).mapToDouble(n -> annotations[n][i]).toArray()) == 0.) { + logger.warn(String.format("All values of the annotation %s are identical in the training data for the %s model.", + annotationNames.get(i), outputPrefix + outputPrefixTag)); + } + if (IntStream.range(0, numData).boxed().map(n -> annotations[n][i]).allMatch(x -> Double.isNaN(x))) { + completelyMissingAnnotationNames.add(annotationNames.get(i)); + } + } + ); + + if (!completelyMissingAnnotationNames.isEmpty()) { + throw new UserException.BadInput( + String.format("All values of the following annotations are missing in the training data for the %s model: %s. 
" + + "Consider repeating the extraction step with this annotation dropped. " + + "If this is a negative model and the amount of negative training data is small, " + + "perhaps also consider lowering the value of the %s argument so that more " + + "training data is considered, which may ultimately admit data with non-missing values for the annotation " + + "(although note that this will also have implications for the resulting model fit); " + + "alternatively, consider excluding the %s and %s arguments and running positive-only modeling.", + outputPrefix + outputPrefixTag, completelyMissingAnnotationNames, + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME)); + } + } + + private File score(final File annotationsFile, + final String outputPrefixTag, + final String outputSuffix) { + final VariantAnnotationsScorer scorer; + switch (modelBackend) { + case JAVA_BGMM: + scorer = BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)); + break; + case PYTHON_IFOREST: + case PYTHON_SCRIPT: + scorer = new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)); + break; + + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix); + scorer.score(annotationsFile, outputScoresFile); + return outputScoresFile; + } + + private File positiveNegativeScore(final File annotationsFile, + final String outputPrefixTag, + final String outputSuffix) { + final VariantAnnotationsScorer scorer; + switch (modelBackend) { + case JAVA_BGMM: + scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)), + BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX))); + break; + case PYTHON_IFOREST: + case PYTHON_SCRIPT: + scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)), + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX))); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix); + scorer.score(annotationsFile, outputScoresFile); + return outputScoresFile; + } + + private static double[][] concatenateLabeledAndUnlabeledNegativeTrainingData(final List annotationNames, + final double[][] annotations, + final double[][] unlabeledAnnotations, + final List isNegativeTrainingFromLabeledTrainingAndVariantType, + final List isNegativeTrainingFromUnlabeledVariantType) { + final File negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile = + LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isNegativeTrainingFromLabeledTrainingAndVariantType); + final double[][] 
negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations = LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile); + + final File negativeTrainingFromUnlabeledVariantTypeAnnotationsFile = + LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isNegativeTrainingFromUnlabeledVariantType); + final double[][] negativeTrainingFromUnlabeledVariantTypeAnnotations = LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromUnlabeledVariantTypeAnnotationsFile); + + return Streams.concat( + Arrays.stream(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations), + Arrays.stream(negativeTrainingFromUnlabeledVariantTypeAnnotations)).toArray(double[][]::new); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java new file mode 100644 index 00000000000..2abd7fce48b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java @@ -0,0 +1,284 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import com.google.common.collect.ImmutableList; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * Represents a collection of {@link LabeledVariantAnnotationsDatum} as a list of lists of datums. + * The outer list is always per-variant. In allele-specific mode, each datum in the inner lists + * corresponds to a single allele; otherwise, each inner list trivially contains a single datum corresponding + * to the variant. 
+ */ +public final class LabeledVariantAnnotationsData { + private static final Logger logger = LogManager.getLogger(LabeledVariantAnnotationsData.class); + + // chunk size in temporary annotation files + // TODO this could be exposed + private static final int CHUNK_DIVISOR = 16; + private static final int MAXIMUM_CHUNK_SIZE = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / CHUNK_DIVISOR; + + private static final int INITIAL_SIZE = 10_000_000; + + public static final String TRAINING_LABEL = "training"; + public static final String CALIBRATION_LABEL = "calibration"; + public static final String SNP_LABEL = "snp"; + + public static final String INTERVALS_PATH = "/intervals"; + public static final String ALLELES_REF_PATH = "/alleles/ref"; + public static final String ALLELES_ALT_PATH = "/alleles/alt"; + public static final String ANNOTATIONS_NAMES_PATH = "/annotations/names"; + public static final String ANNOTATIONS_PATH = "/annotations"; + public static final String LABELS_PATH = "/labels"; + public static final String LABELS_SNP_PATH = LABELS_PATH + "/snp"; + + private final List sortedAnnotationNames; + final List sortedLabels; + + private final List> data; + private final boolean useASAnnotations; + + public LabeledVariantAnnotationsData(final Collection annotationNames, + final Collection labels, + final boolean useASAnnotations, + final int initialSize) { + data = new ArrayList<>(initialSize); + sortedAnnotationNames = ImmutableList.copyOf(annotationNames.stream().distinct().sorted().collect(Collectors.toList())); + Utils.validateArg(sortedAnnotationNames.size() > 0, "Number of annotation names must be positive."); + if (sortedAnnotationNames.size() != annotationNames.size()) { + logger.warn(String.format("Ignoring duplicate annotations: %s.", Utils.getDuplicatedItems(annotationNames))); + } + sortedLabels = ImmutableList.copyOf(labels.stream().distinct().sorted().collect(Collectors.toList())); + if (sortedLabels.size() != labels.size()) { + logger.warn(String.format("Ignoring duplicate labels: %s.", Utils.getDuplicatedItems(labels))); + } + this.useASAnnotations = useASAnnotations; + } + + public LabeledVariantAnnotationsData(final Collection annotationNames, + final Collection labels, + final boolean useASAnnotations) { + this(annotationNames, labels, useASAnnotations, INITIAL_SIZE); + } + + public List getSortedAnnotationNames() { + return sortedAnnotationNames; + } + + public List getSortedLabels() { + return sortedLabels; + } + + public int size() { + return data.size(); + } + + public void clear() { + data.clear(); + } + + /** + * Adds an element to the underlying {@link #data} collection. + */ + public void add(final VariantContext vc, + final List> altAllelesPerDatum, + final List variantTypePerDatum, + final List> labelsPerDatum) { + if (!useASAnnotations) { + data.add(Collections.singletonList(new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(0), variantTypePerDatum.get(0), labelsPerDatum.get(0), sortedAnnotationNames, useASAnnotations))); + } else { + data.add(IntStream.range(0, altAllelesPerDatum.size()).boxed() + .map(i -> new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(i), variantTypePerDatum.get(i), labelsPerDatum.get(i), sortedAnnotationNames, useASAnnotations)) + .collect(Collectors.toList())); + } + } + + /** + * Sets the element at a specified index in the underlying {@link #data} collection. 
+ */ + public void set(final int index, + final VariantContext vc, + final List> altAllelesPerDatum, + final List variantTypePerDatum, + final List> labelsPerDatum) { + if (!useASAnnotations) { + data.set(index, Collections.singletonList(new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(0), variantTypePerDatum.get(0), labelsPerDatum.get(0), sortedAnnotationNames, useASAnnotations))); + } else { + data.set(index, IntStream.range(0, altAllelesPerDatum.size()).boxed() + .map(i -> new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(i), variantTypePerDatum.get(i), labelsPerDatum.get(i), sortedAnnotationNames, useASAnnotations)) + .collect(Collectors.toList())); + } + } + + /** + * @return list of {@link VariantType} indicators, with length given by the number of corresponding sites + */ + public List getVariantTypeFlat() { + return streamFlattenedData().map(datum -> datum.variantType).collect(Collectors.toList()); + } + + /** + * @return list of boolean label indicators, with length given by the number of sites; + * an element in the list will be true if the corresponding site is assigned to the specified label + */ + public List isLabelFlat(final String label) { + return streamFlattenedData().map(datum -> datum.labels.contains(label)).collect(Collectors.toList()); + } + + private Stream streamFlattenedData() { + return data.stream().flatMap(List::stream); + } + + /** + * Writes a representation of the collection to an HDF5 file with the following directory structure: + * + *

+ * |--- alleles
+ * │ |--- alt
+ * │ |--- ref
+ * |--- annotations
+ * │ |--- chunk_0
+ * │ |--- ...
+ * │ |--- chunk_{num_chunks - 1}
+ * │ |--- names
+ * │ |--- num_chunks
+ * │ |--- num_columns
+ * │ |--- num_rows
+ * |--- intervals
+ * │ |--- indexed_contig_names
+ * │ |--- transposed_index_start_end
+ * |--- labels
+ * │ |--- snp
+ * │ |--- ... (e.g., training, calibration, etc.)
+ * │ |--- ...
+ *

+ * + * Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations). + * See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details. + * + * @param omitAllelesInHDF5 string arrays containing ref/alt alleles can be large, so we allow the option of omitting them + */ + public void writeHDF5(final File outputFile, + final boolean omitAllelesInHDF5) { + + try (final HDF5File outputHDF5File = new HDF5File(outputFile, HDF5File.OpenMode.CREATE)) { + IOUtils.canReadFile(outputHDF5File.getFile()); + HDF5Utils.writeIntervals(outputHDF5File, INTERVALS_PATH, + streamFlattenedData().map(datum -> datum.interval).collect(Collectors.toList())); + if (!omitAllelesInHDF5) { + outputHDF5File.makeStringArray(ALLELES_REF_PATH, + streamFlattenedData().map(datum -> datum.refAllele.getDisplayString()).toArray(String[]::new)); + if (!useASAnnotations) { + outputHDF5File.makeStringArray(ALLELES_ALT_PATH, + streamFlattenedData() + .map(datum -> datum.altAlleles.stream().map(Allele::getDisplayString).collect(Collectors.joining(","))) + .toArray(String[]::new)); + } else { + outputHDF5File.makeStringArray(ALLELES_ALT_PATH, + streamFlattenedData().map(datum -> datum.altAlleles.get(0).getDisplayString()).toArray(String[]::new)); + } + } + outputHDF5File.makeStringArray(ANNOTATIONS_NAMES_PATH, sortedAnnotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(outputHDF5File, ANNOTATIONS_PATH, + streamFlattenedData().map(datum -> datum.annotations).toArray(double[][]::new), MAXIMUM_CHUNK_SIZE); + outputHDF5File.makeDoubleArray(LABELS_SNP_PATH, + streamFlattenedData().mapToDouble(datum -> datum.variantType == VariantType.SNP ? 1 : 0).toArray()); + for (final String label : sortedLabels) { + outputHDF5File.makeDoubleArray(String.format("%s/%s", LABELS_PATH, label), + streamFlattenedData().mapToDouble(datum -> datum.labels.contains(label) ? 1 : 0).toArray()); + } + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations and metadata (%s). 
Output file at %s may be in a bad state.", + exception, outputFile.getAbsolutePath())); + } + } + + /** + * @return list of annotation names, with length given by the number of annotations, read from the specified file + */ + public static List readAnnotationNames(final File annotationsFile) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return Arrays.asList(annotationsHDF5File.readStringArray(ANNOTATIONS_NAMES_PATH)); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of annotation names from %s: %s", + annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * @return matrix with dimensions (number of sites) x (number of annotations), read from the specified file + */ + public static double[][] readAnnotations(final File annotationsFile) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return HDF5Utils.readChunkedDoubleMatrix(annotationsHDF5File, ANNOTATIONS_PATH); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of annotations from %s: %s", + annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * @return list of boolean label indicators, with length given by the number of corresponding sites, read from the specified file; + * an element in the list will be true if the corresponding site is assigned to the specified label + */ + public static List readLabel(final File annotationsFile, + final String label) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return Arrays.stream(annotationsHDF5File.readDoubleArray(String.format("/labels/%s", label))).boxed().map(d -> d == 1).collect(Collectors.toList()); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of label %s from %s: %s", + label, annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * Subsets annotation data according to a boolean filter and writes a limited representation to a temporary HDF5 file. + * Intended for passing annotations via the file interfaces of {@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}. 
+ */ + public static File subsetAnnotationsToTemporaryFile(final List annotationNames, + final double[][] allAnnotations, + final List isSubset) { + Utils.validateArg(annotationNames.size() > 0, "Number of annotation names must be positive."); + Utils.validateArg(allAnnotations.length > 0, "Number of annotation data points must be positive."); + Utils.validateArg(annotationNames.size() == allAnnotations[0].length, + "Number of annotation names must match number of features in annotation data."); + final double[][] subsetData = IntStream.range(0, isSubset.size()).boxed().filter(isSubset::get).map(i -> allAnnotations[i]).toArray(double[][]::new); + final File subsetAnnotationsFile = IOUtils.createTempFile("subset.annot", ".hdf5"); + try (final HDF5File subsetAnnotationsHDF5File = new HDF5File(subsetAnnotationsFile, HDF5File.OpenMode.CREATE)) { + subsetAnnotationsHDF5File.makeStringArray(ANNOTATIONS_NAMES_PATH, annotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(subsetAnnotationsHDF5File, ANNOTATIONS_PATH, subsetData, MAXIMUM_CHUNK_SIZE); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations (%s). Output file at %s may be in a bad state.", + exception, subsetAnnotationsFile.getAbsolutePath())); + } + return subsetAnnotationsFile; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java new file mode 100644 index 00000000000..884529f5c56 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java @@ -0,0 +1,104 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.LabeledVariantAnnotationsWalker; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; + +import java.util.List; +import java.util.TreeSet; + +/** + * Represents metadata and annotations extracted from either a variant or a single alt allele (if in allele-specific mode). + * Intended to be package-private and accessed only by {@link LabeledVariantAnnotationsData}. + */ +final class LabeledVariantAnnotationsDatum implements Locatable { + final SimpleInterval interval; + final Allele refAllele; + final ImmutableList altAlleles; // in allele-specific mode, this contains a single alt allele; otherwise, it contains all alt alleles that passed variant-type checks + final VariantType variantType; + final ImmutableSet labels; // sorted TreeSet + final double[] annotations; // TODO use ImmutableDoubleArray? 
+ + LabeledVariantAnnotationsDatum(final VariantContext vc, + final List altAlleles, + final VariantType variantType, + final TreeSet labels, + final List sortedAnnotationNames, + final boolean useASAnnotations) { + Utils.validate(!useASAnnotations || altAlleles.size() == 1, + "Datum should only be associated with one alt allele in allele-specific mode."); + this.interval = new SimpleInterval(vc); + this.refAllele = vc.getReference(); + this.altAlleles = ImmutableList.copyOf(altAlleles); + this.variantType = variantType; + this.labels = ImmutableSet.copyOf(labels); + this.annotations = sortedAnnotationNames.stream() + .mapToDouble(a -> decodeAnnotation(vc, altAlleles, a, useASAnnotations)) + .toArray(); + } + + @Override + public String getContig() { + return interval.getContig(); + } + + @Override + public int getStart() { + return interval.getStart(); + } + + @Override + public int getEnd() { + return interval.getEnd(); + } + + // code mostly retained from VQSR; some exception catching added + private static double decodeAnnotation(final VariantContext vc, + final List altAlleles, + final String annotationName, + final boolean useASAnnotations) { + double value; + try { + // if we're in allele-specific mode and an allele-specific annotation has been requested, parse the appropriate value from the list + // TODO: can we trigger allele-specific parsing based on annotation prefix or some other logic? + if (useASAnnotations && annotationName.startsWith(GATKVCFConstants.ALLELE_SPECIFIC_PREFIX)) { + final List valueList = vc.getAttributeAsList(annotationName); + final Allele altAllele = altAlleles.get(0); + // FIXME: we need to look at the ref allele here too (SL: this comment was retained from VQSR code, I'm not sure what it means...) + if (vc.hasAllele(altAllele)) { + final int altIndex = vc.getAlleleIndex(altAllele) - 1; //- 1 is to convert the index from all alleles (including reference) to just alternate alleles + try { + value = Double.parseDouble((String) valueList.get(altIndex)); + } catch (final IndexOutOfBoundsException e) { + throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " + + "Encountered exception: %s", annotationName, vc, e)); + } + } else { + //if somehow our alleles got mixed up + throw new IllegalStateException("Allele " + altAllele + " is not contained in the input VariantContext."); + } + } else { + try { + value = vc.getAttributeAsDouble(annotationName, Double.NaN); + } catch (final ClassCastException e) { + throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " + + "Ensure that %s is specified, if desired. 
Encountered exception: %s", + annotationName, vc, LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, e)); + } + } + if (Double.isInfinite(value)) { + value = Double.NaN; + } + } catch (final NumberFormatException e) { + value = Double.NaN; + } + return value; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java new file mode 100644 index 00000000000..0bfeb7df4e7 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java @@ -0,0 +1,49 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; + +/** + * Logic for determining variant types was retained from VQSR. + */ +public enum VariantType { + SNP, + INDEL; + + public static boolean checkVariantType(final VariantContext vc, + final VariantContext resourceVC) { + switch (resourceVC.getType()) { + case SNP: + case MNP: + return getVariantType(vc) == SNP; + case INDEL: + case MIXED: + case SYMBOLIC: + return getVariantType(vc) == INDEL; + default: + return false; + } + } + + public static VariantType getVariantType(final VariantContext vc) { + if (vc.isSNP() || vc.isMNP()) { + return SNP; + } else if (vc.isStructuralIndel() || vc.isIndel() || vc.isMixed() || vc.isSymbolic()) { + return INDEL; + } else { + throw new IllegalStateException("Encountered unknown variant type: " + vc.getType()); + } + } + + public static VariantType getVariantType(final VariantContext vc, + final Allele allele) { + if (vc.getReference().length() == allele.length()) { + //note that spanning deletions are considered SNPs by this logic + return SNP; + } else if ((vc.getReference().length() != allele.length()) || allele.isSymbolic()) { + return INDEL; + } else { + throw new IllegalStateException("Encountered unknown variant type: " + vc.getType()); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java new file mode 100644 index 00000000000..53e616bb515 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java @@ -0,0 +1,31 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.apache.commons.lang.NotImplementedException; + +import java.io.File; +import java.io.Serializable; + +// TODO this is just a stub, will be fleshed out in a separate PR +public final class BGMMVariantAnnotationsModel implements VariantAnnotationsModel { + + public BGMMVariantAnnotationsModel(final File hyperparametersJSONFile) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + @Override + public void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + static final class Preprocesser implements Serializable { + private static final long serialVersionUID = 1L; + + Preprocesser() { + } + + double[][] transform(final double[][] data) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + } +} \ No newline at end of file diff 
--git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java new file mode 100644 index 00000000000..0ae9a5e09a8 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java @@ -0,0 +1,67 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.utils.clustering.BayesianGaussianMixtureModeller; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.io.Serializable; +import java.util.List; + +// TODO this is just a stub, will be fleshed out in a separate PR +public final class BGMMVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String BGMM_SCORER_SER_SUFFIX = ".bgmmScorer.ser"; + + public BGMMVariantAnnotationsScorer(final List annotationNames, + final BGMMVariantAnnotationsModel.Preprocesser preprocesser, + final BayesianGaussianMixtureModeller bgmm) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + @Override + public void score(final File inputAnnotationsFile, + final File outputScoresFile) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + public double[][] preprocess(final double[][] annotations) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + public void serialize(final File scorerFile) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + public static BGMMVariantAnnotationsScorer deserialize(final File scorerFile) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } + + // TODO clean this up, copy more fields + public static void preprocessAnnotationsWithBGMMAndWriteHDF5(final List annotationNames, + final String outputPrefix, + final File labeledTrainingAndVariantTypeAnnotationsFile, + final Logger logger) { + final double[][] rawAnnotations = LabeledVariantAnnotationsData.readAnnotations(labeledTrainingAndVariantTypeAnnotationsFile); + final BGMMVariantAnnotationsScorer scorer = BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + BGMM_SCORER_SER_SUFFIX)); + final double[][] preprocessedAnnotations = scorer.preprocess(rawAnnotations); + final File outputPreprocessedAnnotationsFile = new File(outputPrefix + ".annot.pre.hdf5"); + try (final HDF5File hdf5File = new HDF5File(outputPreprocessedAnnotationsFile, HDF5File.OpenMode.CREATE)) { + IOUtils.canReadFile(hdf5File.getFile()); + hdf5File.makeStringArray("/data/annotation_names", annotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(hdf5File, "/data/annotations", preprocessedAnnotations, HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / 16); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception 
encountered during writing of preprocessed annotations (%s). Output file at %s may be in a bad state.", + exception, outputPreprocessedAnnotationsFile.getAbsolutePath())); + } + logger.info(String.format("Preprocessed annotations written to %s.", outputPreprocessedAnnotationsFile.getAbsolutePath())); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java new file mode 100644 index 00000000000..6b6feae5d26 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java @@ -0,0 +1,68 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import org.broadinstitute.hellbender.utils.runtime.ProcessOutput; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Given an HDF5 file containing annotations for a training set (in the format specified by + * {@link VariantAnnotationsModel#trainAndSerialize}), a Python script containing modeling code, + * and a JSON file containing hyperparameters, the {@link #trainAndSerialize} method can be used to train a model. + * + * The modeling script is expected to generate the file {outputPrefix}.scorer.pkl. This file should contain + * a pickled Python lambda function to be used for generating scores from annotations in a subsequent test set. + * The lambda should have the signature: + * + * lambda test_annotation_names_i, test_X_ni + * + * Here, test_annotation_names_i is a numpy array of strings containing the annotation names, and + * test X_ni is a numpy matrix of float-valued annotations, with dimensions (number of data points) x (number of annotations). + * The lambda should check the test annotation names against the training annotation names and + * then return a numpy array of float-valued scores with length given by the number of data points. + * + * See org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation. 
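+ *
+ * A minimal usage sketch from the Java side (the file names here are hypothetical):
+ *
+ * <pre>
+ *     final VariantAnnotationsModel model = new PythonSklearnVariantAnnotationsModel(
+ *             new File("modeling.py"), new File("hyperparameters.json"));
+ *     model.trainAndSerialize(new File("extracted.annot.hdf5"), "output/train.snp");
+ *     // if modeling.py follows the contract above, this run yields output/train.snp.scorer.pkl
+ * </pre>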
+ */ +public final class PythonSklearnVariantAnnotationsModel implements VariantAnnotationsModel { + + private final File pythonScriptFile; + private final File hyperparametersJSONFile; + + public PythonSklearnVariantAnnotationsModel(final File pythonScriptFile, + final File hyperparametersJSONFile) { + this.pythonScriptFile = pythonScriptFile; + this.hyperparametersJSONFile = hyperparametersJSONFile; + } + + @Override + public void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix) { + final PythonScriptExecutor executor = new PythonScriptExecutor(true); + final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput( + pythonScriptFile.getAbsolutePath(), + null, + composePythonArguments(trainingAnnotationsFile, hyperparametersJSONFile, outputPrefix)); + + if (pythonProcessOutput.getExitValue() != 0) { + throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput)); + } + } + + private static List composePythonArguments(final File annotationsFile, + final File hyperparametersJSONFile, + final String outputPrefix) { + try { + return new ArrayList<>(Arrays.asList( + "--annotations_file=" + annotationsFile.getCanonicalPath(), + "--hyperparameters_json_file=" + hyperparametersJSONFile.getCanonicalPath(), + "--output_prefix=" + outputPrefix)); + } catch (final IOException e) { + throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e)); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java new file mode 100644 index 00000000000..cb3ab93a547 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java @@ -0,0 +1,68 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import org.broadinstitute.hellbender.utils.runtime.ProcessOutput; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Given an HDF5 file containing annotations for a test set (in the format specified by + * {@link VariantAnnotationsScorer#score}), a Python script containing scoring code, + * and a file containing a pickled Python lambda function for scoring, + * the {@link #score} method can be used to generate scores. + * + * The scoring script is expected to load both the annotations and the pickled scoring function, + * which are then used to generate the file {outputPrefix}.scores.hdf5. This HDF5 file should contain + * a double array of the scores in {@value SCORES_PATH}, in the same order as the corresponding data points + * in the provided annotations. + * + * See org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation. 
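+ *
+ * A minimal usage sketch (the file names here are hypothetical):
+ *
+ * <pre>
+ *     final VariantAnnotationsScorer scorer = new PythonSklearnVariantAnnotationsScorer(
+ *             new File("scoring.py"), new File("train.snp" + PYTHON_SCORER_PKL_SUFFIX));
+ *     scorer.score(new File("test.annot.hdf5"), new File("test.scores.hdf5"));
+ *     final double[] scores = VariantAnnotationsScorer.readScores(new File("test.scores.hdf5"));
+ * </pre>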
+ */ +public final class PythonSklearnVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String PYTHON_SCORER_PKL_SUFFIX = ".scorer.pkl"; + + private final File pythonScriptFile; + private final File scorerPklFile; + + public PythonSklearnVariantAnnotationsScorer(final File pythonScriptFile, + final File scorerPklFile) { + this.pythonScriptFile = pythonScriptFile; + this.scorerPklFile = scorerPklFile; + } + + @Override + public void score(final File inputAnnotationsFile, + final File outputScoresFile) { + final PythonScriptExecutor executor = new PythonScriptExecutor(true); + final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput( + pythonScriptFile.getAbsolutePath(), + null, + composePythonArguments(inputAnnotationsFile, scorerPklFile, outputScoresFile)); + + if (pythonProcessOutput.getExitValue() != 0) { + throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput)); + } + } + + private static List composePythonArguments(final File annotationsFile, + final File scorerPklFile, + final File outputScoresFile) { + try { + return new ArrayList<>(Arrays.asList( + "--annotations_file=" + annotationsFile.getCanonicalPath(), + "--scorer_pkl_file=" + scorerPklFile.getCanonicalPath(), + "--output_scores_file=" + outputScoresFile.getCanonicalPath())); + } catch (final IOException e) { + throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e)); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java new file mode 100644 index 00000000000..ee2e899d0a8 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java @@ -0,0 +1,46 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; + +import java.io.File; + +/** + * File interface for passing annotations to a modeling backend and indicating a path prefix for resulting output. + */ +public interface VariantAnnotationsModel { + + /** + * @param trainingAnnotationsFile Training annotations in HDF5 format, containing at least the directory structure + * + *

+ *     |--- annotations
+ *     |   |--- chunk_0
+ *     |   |--- ...
+ *     |   |--- chunk_{num_chunks - 1}
+ *     |   |--- names
+ *     |   |--- num_chunks
+ *     |   |--- num_columns
+ *     |   |--- num_rows
+ *
+ * + * Here, each chunk is a double matrix, with dimensions given by + * (number of sites in the chunk) x (number of annotations). + * See {@link LabeledVariantAnnotationsData#writeHDF5}. + * + * Modeling backends are responsible for consuming annotations in this format + * and outputting a {@link VariantAnnotationsScorer} for each variant type + * with the appropriate output names. This responsibility includes the + * implementation of functionality that allows validation of annotation names + * in downstream {@link VariantAnnotationsScorer} instances. + * + * In current use, we assume that a single model will be trained, so either + * 1) training annotations have already been subset to a single variant type (SNP or INDEL), or + * 2) we assume the model does not care about the variant type. + * TODO we could also pass additional labels to be used in training, + * but all backends would have to likewise respect directory structure + * + * @param outputPrefix Path prefix for all output files + */ + void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix); +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java new file mode 100644 index 00000000000..a4fa8460440 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java @@ -0,0 +1,16 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +public enum VariantAnnotationsModelBackend { + // TODO will be added in a separate PR + JAVA_BGMM, + + /** + * Use the script at org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py + */ + PYTHON_IFOREST, + + /** + * Use a user-provided script. + */ + PYTHON_SCRIPT +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java new file mode 100644 index 00000000000..6b881fdcbe2 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java @@ -0,0 +1,111 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.hipparchus.stat.fitting.EmpiricalDistribution; + +import java.io.File; +import java.util.Arrays; +import java.util.function.Function; +import java.util.stream.IntStream; + +/** + * File interface for passing annotations to a scoring backend and returning scores. + */ +public interface VariantAnnotationsScorer { + + String SCORES_PATH = "/data/scores"; // our HDF5 library does not allow writing to a bare/root path (e.g., /scores) + + /** + * @param inputAnnotationsFile Annotations to be scored in HDF5 format, containing at least the directory structure + * + *

+ *     |--- annotations
+ *     |   |--- chunk_0
+ *     |   |--- ...
+ *     |   |--- chunk_{num_chunks - 1}
+ *     |   |--- names
+ *     |   |--- num_chunks
+ *     |   |--- num_columns
+ *     |   |--- num_rows
+ *
+ * + * Here, each chunk is a double matrix, with dimensions given by + * (number of sites in the chunk) x (number of annotations). + * See {@link LabeledVariantAnnotationsData#writeHDF5}. + * + * Scoring backends are responsible for consuming annotations in this format and + * outputting a double array of scores to file. This responsibility includes + * validation of annotation names. + * + * @param outputScoresFile Output file in HDF5 format, containing scores at {@link VariantAnnotationsScorer#SCORES_PATH}. + */ + void score(final File inputAnnotationsFile, + final File outputScoresFile); + + /** + * Given scores for a calibration set, returns a function for converting a subsequent score to a + * sensitivity to that calibration set. This function is simply given by 1 - ECDF, + * where ECDF is the empirical cumulative distribution function of the calibration scores; + * see here. + * For example, a score that is very low relative to the calibration scores would yield a + * high calibration sensitivity; that is, using this score as the minimum allowable threshold for filtering + * would result in a high sensitivity to the calibration set. + * + * @param calibrationScores must all be finite + */ + static Function createScoreToCalibrationSensitivityConverter(final double[] calibrationScores) { + Utils.validateArg(Arrays.stream(calibrationScores).allMatch(Double::isFinite), + "Calibration scores must all be finite."); + final EmpiricalDistribution empiricalDistribution = new EmpiricalDistribution(); + empiricalDistribution.load(calibrationScores); + return score -> 1. - empiricalDistribution.cumulativeProbability(score); + } + + /** + * Reads a double array of scores from {@value SCORES_PATH} in an HDF5 file. + */ + static double[] readScores(final File inputFile) { + try (final HDF5File inputHDF5File = new HDF5File(inputFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(inputHDF5File.getFile()); + return inputHDF5File.readDoubleArray(SCORES_PATH); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of scores from %s: %s", + inputFile.getAbsolutePath(), exception)); + } + } + + /** + * Writes a double array of scores to {@value SCORES_PATH} in an HDF5 file. + */ + static void writeScores(final File outputFile, + final double[] scores) { + try (final HDF5File outputHDF5File = new HDF5File(outputFile, HDF5File.OpenMode.CREATE)) { + outputHDF5File.makeDoubleArray(SCORES_PATH, scores); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of scores (%s). Output file at %s may be in a bad state.", + exception, outputFile.getAbsolutePath())); + } + } + + /** + * Yields a VQSR-style positive-negative scorer that returns the difference of the positive score and the negative score. 
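+ *
+ * For example (a sketch only; construction of the individual scorers is backend-dependent and the file names are hypothetical):
+ *
+ * <pre>
+ *     final VariantAnnotationsScorer scorer =
+ *             VariantAnnotationsScorer.combinePositiveAndNegativeScorer(positiveScorer, negativeScorer);
+ *     scorer.score(new File("test.annot.hdf5"), new File("test.scores.hdf5")); // per-site positive score minus negative score
+ * </pre>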
+ */ + static VariantAnnotationsScorer combinePositiveAndNegativeScorer(final VariantAnnotationsScorer positiveScorer, + final VariantAnnotationsScorer negativeScorer) { + return (inputAnnotationsFile, outputScoresFile) -> { + final File tempPositiveScoresFile = IOUtils.createTempFile("positive", "scores.hdf5"); + final File tempNegativeScoresFile = IOUtils.createTempFile("negative", "scores.hdf5"); + positiveScorer.score(inputAnnotationsFile, tempPositiveScoresFile); + final double[] positiveScores = VariantAnnotationsScorer.readScores(tempPositiveScoresFile); + negativeScorer.score(inputAnnotationsFile, tempNegativeScoresFile); + final double[] negativeScores = VariantAnnotationsScorer.readScores(tempNegativeScoresFile); + final double[] scores = IntStream.range(0, positiveScores.length).mapToDouble(i -> positiveScores[i] - negativeScores[i]).toArray(); + VariantAnnotationsScorer.writeScores(outputScoresFile, scores); + }; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java index edd19e5686c..3a2ccba5f6a 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java @@ -34,6 +34,7 @@ public final class MathUtils { public static final double LOG10_ONE_HALF = Math.log10(0.5); public static final double LOG10_ONE_THIRD = -Math.log10(3.0); public static final double LOG_ONE_THIRD = -Math.log(3.0); + public static final double LOG_2 = Math.log(2.0); public static final double INV_LOG_2 = 1.0 / Math.log(2.0); public static final double LOG_10 = Math.log(10); public static final double INV_LOG_10 = 1.0 / LOG_10; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java index 6de748f01f7..55f7b9d8909 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java @@ -81,7 +81,7 @@ public static double logSumExp(final double... logValues) { } } if ( Double.isNaN(sum) || sum == Double.POSITIVE_INFINITY ) { - throw new IllegalArgumentException("log10 p: Values must be non-infinite and non-NAN"); + throw new IllegalArgumentException("logValues must be non-infinite and non-NAN"); } return maxValue + (sum != 1.0 ? 
Math.log(sum) : 0.0); } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java b/src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java new file mode 100644 index 00000000000..fc759db3e9d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java @@ -0,0 +1,35 @@ +package org.broadinstitute.hellbender.utils.clustering; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.RealVector; + +import java.io.Serializable; + +public final class BayesianGaussianMixtureModeller implements Serializable { + private static final long serialVersionUID = 1L; + + public enum InitMethod { + K_MEANS_PLUS_PLUS, RANDOM, TEST + } + + private BayesianGaussianMixtureModeller(final int nComponents, + final double tol, + final double regCovar, + final int maxIter, + final int nInit, + final InitMethod initMethod, + final double weightConcentrationPrior, + final double meanPrecisionPrior, + final RealVector meanPrior, + final Double degreesOfFreedomPrior, + final RealMatrix covariancePrior, + final int seed, + final boolean warmStart, + final int verboseInterval, + final double relativeSymmetryThreshold, + final double absolutePositivityThreshold, + final double epsilon) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } +} \ No newline at end of file diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json new file mode 100644 index 00000000000..172b8aa42eb --- /dev/null +++ b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json @@ -0,0 +1,3 @@ +{ + "random_state": 0 +} \ No newline at end of file diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py new file mode 100644 index 00000000000..554817162b2 --- /dev/null +++ b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py @@ -0,0 +1,138 @@ +import argparse +import h5py +import sklearn.ensemble +import sklearn.impute +import numpy as np +import dill +import json + + +def read_annotations(h5file): + with h5py.File(h5file, 'r') as f: + annotation_names_i = f['/annotations/names'][()].astype(str) + + # read chunked annotations + num_chunks = int(f['/annotations/num_chunks'][()]) + num_columns = int(f['/annotations/num_columns'][()]) + num_rows = int(f['/annotations/num_rows'][()]) + X_ni = np.zeros((num_rows, num_columns)) + n = 0 + for chunk_index in range(num_chunks): + chunk_ni = f[f'/annotations/chunk_{chunk_index}'][()] + num_rows_in_chunk = len(chunk_ni) + X_ni[n:n + num_rows_in_chunk, :] = chunk_ni + n += num_rows_in_chunk + assert n == num_rows + return annotation_names_i, X_ni + + +def train(annotations_file, + hyperparameters_json_file, + output_prefix): + print('Reading annotations...') + annotation_names_i, X_ni = read_annotations(annotations_file) + print(f'Annotations: {annotation_names_i}.') + + print('Reading hyperparameters...') + with open(hyperparameters_json_file) as json_file: + hyperparameters_kwargs = 
json.load(json_file) + print('Hyperparameters:', hyperparameters_kwargs) + + print('Imputing annotations...') + imputer = sklearn.impute.SimpleImputer(strategy='median') + imputed_X_ni = imputer.fit_transform(X_ni) + + # SimpleImputer will drop any features that are completely missing, resulting in different shapes for + # imputed_X_ni and X_ni and misalignment of features when training and scoring downstream if not checked. + # We externally check for and fail in the presence of any entirely missing features, but we do a redundant check here. + assert imputed_X_ni.shape == X_ni.shape, \ + f'Shape of imputed annotations differs from shape of raw annotations; at least one feature is completely missing ' \ + f'and hence dropped during imputation.' + + print(f'Training IsolationForest with {imputed_X_ni.shape[0]} training sites x {imputed_X_ni.shape[1]} annotations...') + clf = sklearn.ensemble.IsolationForest(**hyperparameters_kwargs) + clf.fit(imputed_X_ni) + print('Training complete.') + + def score_samples(test_annotation_names_i, + test_X_ni): + assert np.array_equal(test_annotation_names_i, annotation_names_i), \ + f'Input annotation names ({test_annotation_names_i}) must be identical to those used to train the scorer ({annotation_names_i}).' + return clf.score_samples(imputer.transform(test_X_ni)) # TODO sklearn's implementation is single-threaded, but this could perhaps be parallelized + + scorer_lambda = lambda test_annotation_names_i, test_X_ni: score_samples(test_annotation_names_i, test_X_ni) + + print(f'Pickling scorer...') + output_scorer_pkl_file = f'{output_prefix}.scorer.pkl' + with open(output_scorer_pkl_file, 'wb') as f: + dill.dump(scorer_lambda, f) # the dill package can be used to pickle lambda functions + print(f'Scorer pickled to {output_scorer_pkl_file}.') + + +def score(annotations_file, + scorer_pkl_file, + output_scores_file): + annotation_names_i, X_ni = read_annotations(annotations_file) + + with open(scorer_pkl_file, 'rb') as f: + scorer_lambda = dill.load(f) + score_n = scorer_lambda(annotation_names_i, X_ni) + + with h5py.File(output_scores_file, 'w') as f: + scores_dset = f.create_dataset('data/scores', (len(score_n),), dtype='d') + scores_dset[:] = score_n + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument('--annotations_file', + type=str, + required=True, + help='') + + parser.add_argument('--hyperparameters_json_file', + type=str, + required=False, + help='') + + parser.add_argument('--output_prefix', + type=str, + required=False, + help='') + + parser.add_argument('--scorer_pkl_file', + type=str, + required=False, + help='') + + parser.add_argument('--output_scores_file', + type=str, + required=False, + help='') + + args = parser.parse_args() + + annotations_file = args.annotations_file + + # this script can handle both training and scoring; we check the passed arguments to determine which is appropriate + if args.hyperparameters_json_file is not None and args.output_prefix is not None and \ + args.scorer_pkl_file is None and args.output_scores_file is None: + hyperparameters_json_file = args.hyperparameters_json_file + output_prefix = args.output_prefix + train(annotations_file, + hyperparameters_json_file, + output_prefix) + elif args.hyperparameters_json_file is None and args.output_prefix is None and \ + args.scorer_pkl_file is not None and args.output_scores_file is not None: + scorer_pkl_file = args.scorer_pkl_file + output_scores_file = args.output_scores_file + score(annotations_file, + scorer_pkl_file, + 
output_scores_file) + else: + raise + + +if __name__ == '__main__': + main() diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java new file mode 100644 index 00000000000..509dfffee1a --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java @@ -0,0 +1,253 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * Note that the expected outputs for the exact-match tests below are used as inputs for + * {@link TrainVariantAnnotationsModelIntegrationTest}. Similarly, the expected outputs for + * {@link TrainVariantAnnotationsModelIntegrationTest} are used as inputs for {@link ScoreVariantAnnotationsIntegrationTest}. + * Thus, developers should keep the expected outputs for all of these integration tests in sync when updating any of them. + * This can easily be done by setting the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS flags for all tools to be true and then running + * the tests in order. + */ +public final class ExtractVariantAnnotationsIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=ExtractVariantAnnotationsIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on. 
+ */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final List NON_ALLELE_SPECIFIC_ANNOTATIONS = Arrays.asList( + "DP", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR"); + + private static final List ALLELE_SPECIFIC_ANNOTATIONS = Arrays.asList( + "DP", "AS_FS", "AS_MQ", "AS_MQRankSum", "AS_QD", "AS_ReadPosRankSum", "AS_SOR"); + + private static final File PACKAGE_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/"); + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + // The input VCF should cover a genomic region given by the union of regions in the below training and calibration resources + // and should also contain a few multiallelics that overlap those resources. + private static final File INPUT_VCF = new File(PACKAGE_TEST_FILES_DIR, "input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf"); + + // We use snippets of the Omni sites for SNP training (chr1:1-5000000) and calibration (chr1:5000000-10000000); we don't sweat the 1bp overlap. + private static final File SNP_TRAINING_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz"); + private static final File SNP_CALIBRATION_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz"); + + // We use snippets of the Mills sites for indel training (chr1:1-5000000) and calibration (chr1:5000000-10000000); we don't sweat the 1bp overlap. + private static final File INDEL_TRAINING_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz"); + private static final File INDEL_CALIBRATION_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz"); + + private static final int MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS = 100; + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. 
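+    // These helpers are chained via Function.andThen in dataValidInputs below and applied to a fresh builder from
+    // BASE_ARGUMENTS_BUILDER_SUPPLIER to assemble the command line for each test configuration.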
+ private static final Supplier BASE_ARGUMENTS_BUILDER_SUPPLIER = () -> { + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addVCF(INPUT_VCF); + argsBuilder.addFlag(LabeledVariantAnnotationsWalker.DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME); // we do not gzip VCF outputs so that we can use diff to compare to the expected result + argsBuilder.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, false); + return argsBuilder; + }; + static final Function ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> { + NON_ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a)); + return argsBuilder; + }; + static final Function ADD_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> { + argsBuilder.addFlag(LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME); + ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a)); + return argsBuilder; + }; + static final Function ADD_SNP_MODE_AND_RESOURCES = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":omni-training,%s=true", LabeledVariantAnnotationsData.TRAINING_LABEL), SNP_TRAINING_VCF) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":omni-calibration,%s=true", LabeledVariantAnnotationsData.CALIBRATION_LABEL), SNP_CALIBRATION_VCF); + return argsBuilder; + }; + static final Function ADD_INDEL_MODE_AND_RESOURCES = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.INDEL) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":mills-training,%s=true", LabeledVariantAnnotationsData.TRAINING_LABEL), INDEL_TRAINING_VCF) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":mills-calibration,%s=true", LabeledVariantAnnotationsData.CALIBRATION_LABEL), INDEL_CALIBRATION_VCF); + return argsBuilder; + }; + private static final Function ADD_MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS = argsBuilder -> { + argsBuilder.add(ExtractVariantAnnotations.MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS); + return argsBuilder; + }; + + /** + * Exact-match tests for configurations given by the Cartesian product of the following options: + * 1) non-allele-specific ("nonAS') vs. allele-specific ("AS") + * 2) SNP-only ("snp") vs. INDEL-only ("indel") vs. SNP+INDEL ("snpIndel") + * 3) positive ("pos") vs. 
positive-unlabeled ("posUn") + */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List>>> testConfigurations = Lists.cartesianProduct( + Collections.singletonList( + Pair.of("extract", Function.identity())), + Arrays.asList( + Pair.of("nonAS", ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS), + Pair.of("AS", ADD_ALLELE_SPECIFIC_ANNOTATIONS)), + Arrays.asList( + Pair.of("snp", ADD_SNP_MODE_AND_RESOURCES), + Pair.of("indel", ADD_INDEL_MODE_AND_RESOURCES), + Pair.of("snpIndel", ADD_SNP_MODE_AND_RESOURCES.andThen(ADD_INDEL_MODE_AND_RESOURCES))), + Arrays.asList( + Pair.of("pos", Function.identity()), + Pair.of("posUn", ADD_MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., "extract.nonAS.snp.pos" + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snp.pos") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. + * + * We perform exact-match tests of any annotation HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * We also perform exact-match tests of VCF files using diff. VCF indices may not be diff equivalent, so + * we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? EXPECTED_TEST_FILES_DIR : createTempDir("extract"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + runCommandLine(argsBuilder); + + if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertOutputs(tag, outputPrefix); + } + } + + private static void assertOutputs(final String tag, + final String outputPrefix) { + // vcf.idx files are not reproducible + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX, + outputPrefix + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("diff %s/%s.vcf %s.vcf", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + if (tag.contains("posUn")) { + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tag + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX, + outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX)); + } else { + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + } + } + + /** + * If no resources are provided and we do not extract unlabeled sites, then only a zero-record VCF and the corresponding index are created. + * This is because we cannot create HDF5 files with empty arrays/matrices. 
+ */ + @Test + public void testNoResources() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + /** + * If no resources are provided but we do extract unlabeled sites, then all output files except the labeled-annotations HDF5 file are created. + * This is because we cannot create HDF5 files with empty arrays/matrices. + */ + @Test + public void testNoResourcesAndExtractUnlabeled() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(ExtractVariantAnnotations.MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, 1) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + /** + * If no variants are present in the input in the specified region, then only a zero-record VCF and the corresponding index are created. + * This is because we cannot create HDF5 files with empty arrays/matrices. 
+ */ + @Test + public void testNoVariantsInInput() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + @Test(expectedExceptions = UserException.class) + public void testForgotToSpecifyUseAlleleSpecificAnnotationsFlag() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_SNP_MODE_AND_RESOURCES.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a)); + argsBuilder.addOutput(outputPrefix); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.class) + public void testReservedSNPResourceLabel() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":snp,%s=true", LabeledVariantAnnotationsData.SNP_LABEL), SNP_TRAINING_VCF) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java new file mode 100644 index 00000000000..83807d6a7a7 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java @@ -0,0 +1,260 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutorException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import 
java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * See documentation for {@link ExtractVariantAnnotationsIntegrationTest} for information about how inputs and + * expected outputs used there are related to those used here and in {@link TrainVariantAnnotationsModelIntegrationTest}. + */ +public final class ScoreVariantAnnotationsIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=ScoreVariantAnnotationsIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on. + */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final double CALIBRATION_SENSITIVITY_THRESHOLD = 0.9; + + private static final File PACKAGE_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/"); + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score"); + private static final File INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + private static final File ISOLATION_FOREST_PYTHON_SCRIPT = IOUtils.writeTempResource( + new Resource("isolation-forest.py", TrainVariantAnnotationsModel.class)); + + private static final File INPUT_VCF = new File(PACKAGE_TEST_FILES_DIR, "input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf"); + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. 
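+    // As in ExtractVariantAnnotationsIntegrationTest, these helpers are chained via Function.andThen in dataValidInputs below;
+    // the model-prefix and calibration-sensitivity-threshold arguments are added separately in testValidInputs.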
+ private static final Supplier<ArgumentsBuilder> BASE_ARGUMENTS_BUILDER_SUPPLIER = () -> { + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addVCF(INPUT_VCF); + argsBuilder.addFlag(LabeledVariantAnnotationsWalker.DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME); + argsBuilder.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, false); + return argsBuilder; + }; + private static final BiFunction<ArgumentsBuilder, String, ArgumentsBuilder> ADD_MODEL_PREFIX = (argsBuilder, modelPrefix) -> { + argsBuilder.add(ScoreVariantAnnotations.MODEL_PREFIX_LONG_NAME, modelPrefix); + return argsBuilder; + }; + private static final BiFunction<ArgumentsBuilder, Double, ArgumentsBuilder> ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> { + argsBuilder.add(ScoreVariantAnnotations.SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + argsBuilder.add(ScoreVariantAnnotations.INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + return argsBuilder; + }; + private static final BiFunction<ArgumentsBuilder, VariantAnnotationsModelBackend, ArgumentsBuilder> ADD_MODEL_BACKEND = (argsBuilder, modelBackendMode) -> { + argsBuilder.add(TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME, modelBackendMode); + return argsBuilder; + }; + private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_ISOLATION_FOREST_PYTHON_SCRIPT = argsBuilder -> { + argsBuilder.add(ScoreVariantAnnotations.PYTHON_SCRIPT_LONG_NAME, ISOLATION_FOREST_PYTHON_SCRIPT); + return argsBuilder; + }; + + /** + * Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options: + * 1) non-allele-specific ("nonAS") vs. allele-specific ("AS") + * 2) model backend + * 2a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + * 2b) default PYTHON_IFOREST ("IF.score") + * 2c) specified PYTHON_SCRIPT ("IF.score"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface + * We should expect 2b-c to give functionally identical results. + * 3) SNP-only ("snp") vs.
SNP+INDEL ("snpIndel") (for both of these options, we use trained models that contain both SNP and INDEL scorers as input) + * TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List<List<Pair<String, Function<ArgumentsBuilder, ArgumentsBuilder>>>> testConfigurations = Lists.cartesianProduct( + Arrays.asList( + Pair.of("extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg", Function.identity()), + Pair.of("extract.AS.snpIndel.posUn.train.snpIndel.posNeg", Function.identity())), + Arrays.asList( + Pair.of("IF.score", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), // this and the following case give the same results, so they are given the same IF.score tag + Pair.of("IF.score", ADD_ISOLATION_FOREST_PYTHON_SCRIPT + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_SCRIPT)))), + Arrays.asList( + Pair.of("snp", ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES), + Pair.of("snpIndel", ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_INDEL_MODE_AND_RESOURCES)))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. + * + * We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * We also perform exact-match tests of VCF files using diff. VCF indices may not be diff equivalent, so + * we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ?
EXPECTED_TEST_FILES_DIR : createTempDir("score"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + + // add arguments for model prefix based on the + // train tag (the portion of the tag preceding ".score", e.g., extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF), + // which gives the basename for the model files + final String trainTag = tag.split(".score")[0]; + if (tag.contains("nonAS")) { + ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(argsBuilder); + } else { + ExtractVariantAnnotationsIntegrationTest.ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(argsBuilder); + } + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, trainTag).toString(); + final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + final Function<ArgumentsBuilder, ArgumentsBuilder> addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addModelPrefix.andThen(addCalibrationSensitivityThreshold).apply(argsBuilder); + + // TODO test use of sites-only VCF (output by extract tool) to label extracted sites + + runCommandLine(argsBuilder); + + if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertExpectedOutputs(tag, outputPrefix); + } + } + + private static void assertExpectedOutputs(final String tag, + final String outputPrefix) { + // vcf.idx files are not reproducible + SystemCommandUtilsTest.runSystemCommand(String.format("diff %s/%s.vcf %s.vcf", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s.annot.hdf5 %s.annot.hdf5", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s.scores.hdf5 %s.scores.hdf5", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + } + + /** + * In contrast to {@link ExtractVariantAnnotationsIntegrationTest#testNoResources}, the non-presence of + * resources here does not really affect the output. + */ + @Test(groups = {"python"}) // python environment is required to run tool + public void testNoResources() { + final File outputDir = createTempDir("score"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) + .apply(argsBuilder); + runCommandLine(argsBuilder); + Assert.assertTrue(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + /** + * If no variants are present in the input in the specified region, we do not create the scores or annotations HDF5 files. + * This is because we cannot create HDF5 files with empty arrays/matrices.
+ */ + @Test(groups = {"python"}) // python environment is required to run tool + public void testNoVariantsInInput() { + final File outputDir = createTempDir("score"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.INTERVALS_LONG_NAME, "chr2") // the test input VCF does not have variants here + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES) + .apply(argsBuilder); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + @Test(expectedExceptions = PythonScriptExecutorException.class, groups = {"python"}) // python environment is required to run tool + public void testAnnotationsDoNotMatchThoseUsedToTrainModel() { + final File outputDir = createTempDir("score"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) // model was trained with non-AS annotations + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_ALLELE_SPECIFIC_ANNOTATIONS) // but we additionally specify AS annotations + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.class, groups = {"python"}) // python environment is required to run tool + public void testReservedSNPResourceLabel() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":snp,%s=true", LabeledVariantAnnotationsData.SNP_LABEL), INPUT_VCF) // we just use the input VCF as a dummy resource + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function<ArgumentsBuilder, ArgumentsBuilder> addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix.apply(argsBuilder); + runCommandLine(argsBuilder); + } +} \
No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java new file mode 100644 index 00000000000..705f292116a --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java @@ -0,0 +1,62 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; + +public final class SystemCommandUtilsTest extends GATKBaseTest { + + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + static void runSystemCommand(final String command) { + logger.debug(String.format("Testing command: %s", command)); + try { + final ProcessBuilder processBuilder = new ProcessBuilder("sh", "-c", command).redirectErrorStream(true); + final Process process = processBuilder.start(); + + final BufferedReader stdInReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + String stdInLine; + while ((stdInLine = stdInReader.readLine()) != null) { + Assert.fail(String.format("The command \"%s\" resulted in: %s", command, stdInLine)); + } + stdInReader.close(); + + } catch (final IOException e) { + throw new GATKException.ShouldNeverReachHereException(e.getMessage()); + } + } + + @Test(groups = {"python"}) // python environment is required to use h5diff + public void testRunSystemCommand() { + runSystemCommand(String.format("h5diff %s/extract.AS.indel.pos.annot.hdf5 %s/extract.AS.indel.pos.annot.hdf5", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + runSystemCommand(String.format("diff %s/extract.AS.indel.pos.vcf %s/extract.AS.indel.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class, groups = {"python"}) // python environment is required to use h5diff + public void testRunSystemCommandH5diffException() { + runSystemCommand(String.format("h5diff %s/extract.AS.indel.pos.annot.hdf5 %s/extract.AS.snp.pos.annot.hdf5", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class) + public void testRunSystemCommandDiffException() { + runSystemCommand(String.format("diff %s/extract.AS.indel.pos.vcf %s/extract.AS.snp.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class) + public void testRunSystemCommandDiffNoSuchFileException() { + runSystemCommand(String.format("diff %s/blahblah %s/extract.AS.snp.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java new file mode 100644 index 00000000000..03e09782e1e --- /dev/null +++ 
b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java @@ -0,0 +1,432 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * See documentation for {@link ExtractVariantAnnotationsIntegrationTest} for information about how inputs and + * expected outputs used there are related to those used here and in {@link ScoreVariantAnnotationsIntegrationTest}. + */ +public final class TrainVariantAnnotationsModelIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=TrainVariantAnnotationsIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on. 
+ */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final double CALIBRATION_SENSITIVITY_THRESHOLD = 0.9; + + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train"); + private static final File INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + private static final File ISOLATION_FOREST_PYTHON_SCRIPT = IOUtils.writeTempResource( + new Resource("isolation-forest.py", TrainVariantAnnotationsModel.class)); + private static final File ISOLATION_FOREST_HYPERPARAMETERS_JSON = new File(TEST_FILES_DIR, + "isolation-forest-hyperparameters-different-seed.json"); + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. + private static final Supplier<ArgumentsBuilder> BASE_ARGUMENTS_BUILDER_SUPPLIER = ArgumentsBuilder::new; + private static final BiFunction<ArgumentsBuilder, File, ArgumentsBuilder> ADD_ANNOTATIONS_HDF5 = (argsBuilder, annotationsHDF5) -> { + argsBuilder.add(TrainVariantAnnotationsModel.ANNOTATIONS_HDF5_LONG_NAME, annotationsHDF5); + return argsBuilder; + }; + private static final BiFunction<ArgumentsBuilder, File, ArgumentsBuilder> ADD_UNLABELED_ANNOTATIONS_HDF5 = (argsBuilder, unlabeledAnnotationsHDF5) -> { + argsBuilder.add(TrainVariantAnnotationsModel.UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, unlabeledAnnotationsHDF5); + return argsBuilder; + }; + private static final BiFunction<ArgumentsBuilder, Double, ArgumentsBuilder> ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> { + argsBuilder.add(TrainVariantAnnotationsModel.CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + return argsBuilder; + }; + private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_SNP_MODE = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP); + return argsBuilder; + }; + private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_INDEL_MODE = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.INDEL); + return argsBuilder; + }; + private static final BiFunction<ArgumentsBuilder, VariantAnnotationsModelBackend, ArgumentsBuilder> ADD_MODEL_BACKEND = (argsBuilder, modelBackendMode) -> { + argsBuilder.add(TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME, modelBackendMode); + return argsBuilder; + }; + private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_ISOLATION_FOREST_PYTHON_SCRIPT = argsBuilder -> { + argsBuilder.add(TrainVariantAnnotationsModel.PYTHON_SCRIPT_LONG_NAME, ISOLATION_FOREST_PYTHON_SCRIPT); + return argsBuilder; + }; + private static final Function<ArgumentsBuilder, ArgumentsBuilder> ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON = argsBuilder -> { + argsBuilder.add(TrainVariantAnnotationsModel.HYPERPARAMETERS_JSON_LONG_NAME, ISOLATION_FOREST_HYPERPARAMETERS_JSON); + return argsBuilder; + }; + + /** + * Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options: + * 1) non-allele-specific ("nonAS") vs. allele-specific ("AS") + * 2) SNP-only ("snp") vs. SNP+INDEL ("snpIndel") (for both of these options, we use extracted annotations that contain both SNP and INDEL variants as input) + * 3) positive training with {extract-tag}.annot.hdf5 ("posOnly") vs.
positive-unlabeled training with {extract-tag}.annot.hdf5 and {extract-tag}.unlabeled.annot.hdf5 ("posNeg") + * 4) model backend + * 4a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + * 4b) default PYTHON_IFOREST with default hyperparameters ("IF") + * 4c) default PYTHON_IFOREST with non-default seed hyperparameter ("IFDifferentSeed") + * 4d) specified PYTHON_SCRIPT with non-default seed hyperparameter ("IFDifferentSeed"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface + * We should expect 4c-d to give functionally identical results. + */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List<List<Pair<String, Function<ArgumentsBuilder, ArgumentsBuilder>>>> testConfigurations = Lists.cartesianProduct( + Arrays.asList( + Pair.of("extract.nonAS.snpIndel.posUn.train", Function.identity()), + Pair.of("extract.AS.snpIndel.posUn.train", Function.identity())), + Arrays.asList( + Pair.of("snp", ADD_SNP_MODE), + Pair.of("snpIndel", ADD_SNP_MODE.andThen(ADD_INDEL_MODE))), + Arrays.asList( // we will consume the tag and add appropriate arguments for positive and positive-negative training below + Pair.of("posOnly", Function.identity()), + Pair.of("posNeg", Function.identity())), + Arrays.asList( + Pair.of("IF", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), + Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST))), // this and the following case give the same results, so they are given the same IFDifferentSeed tag + Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_PYTHON_SCRIPT + .andThen(ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON) + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_SCRIPT))))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. + * + * We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * Binary serialized scorers may not be diff equivalent, so we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ?
EXPECTED_TEST_FILES_DIR : createTempDir("train"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + + // add arguments for positive/unlabeled annotations based on the + // extract tag (the portion of the tag preceding ".train", e.g., extract.nonAS.snpIndel.posUn), + // which gives the basename for the annotation files + final String extractTag = tag.split(".train")[0]; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + if (tag.contains("posNeg")) { + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final double calibrationSensitivityThreshold = CALIBRATION_SENSITIVITY_THRESHOLD; + final Function<ArgumentsBuilder, ArgumentsBuilder> addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, calibrationSensitivityThreshold); + addPositiveAnnotations.andThen(addUnlabeledAnnotations).andThen(addCalibrationSensitivityThreshold).apply(argsBuilder); + } else { + addPositiveAnnotations.apply(argsBuilder); + } + + runCommandLine(argsBuilder); + + if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertExpectedOutputs(tag, outputPrefix); + } + } + + private static void assertExpectedOutputs(final String tag, + final String outputPrefix) { + if (tag.contains("train.snp.")) { + assertExpectedOutputsForVariantType(tag, outputPrefix, "snp"); + assertOutputsForVariantTypeDoNotExist(outputPrefix, "indel"); + } else if (tag.contains("train.snpIndel.")) { + assertExpectedOutputsForVariantType(tag, outputPrefix, "snp"); + assertExpectedOutputsForVariantType(tag, outputPrefix, "indel"); + } else { + Assert.fail("Unknown variant-type tag."); + } + } + + private static void assertExpectedOutputsForVariantType(final String tag, + final String outputPrefix, + final String variantType) { + final String tagAndVariantType = String.format("%s.%s", tag, variantType); + final String outputPrefixAndVariantType = String.format("%s.%s", outputPrefix, variantType); + + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX)); + + assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, false); + + if (tag.contains("posNeg")) { + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX)); + assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, true); + } else { + Assert.assertFalse(new File(outputPrefixAndVariantType +
TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } + } + + private static void assertOutputsForVariantTypeDoNotExist(final String outputPrefix, + final String variantType) { + final String outputPrefixAndVariantType = String.format("%s.%s", outputPrefix, variantType); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } + + /** + * Binary serialized scorers may not be diff equivalent, so we just check for their existence. + * We assume that checking elsewhere for equivalence of the scores that the scorers generate provides sufficient + * coverage. + */ + private static void assertScorerExpectedOutputs(final String tagAndVariantType, + final String outputPrefixAndVariantType, + final boolean isNegative) { + final String positiveOrNegativeTag = isNegative ? 
".negative" : ""; + final String scorerTag = outputPrefixAndVariantType + positiveOrNegativeTag; + if (tagAndVariantType.contains("BGMM")) { + Assert.assertTrue(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } else if (tagAndVariantType.contains("IF")) { + Assert.assertTrue(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + Assert.assertFalse(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + } else { + Assert.fail("Unknown model-backend tag."); + } + } + + @Test(groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testSNPOnlyModelsFromSNPOnlyAndSNPPlusIndelAnnotationsAreIdentical() { + final File outputDir = createTempDir("train"); + + final String outputPrefixSNPOnly = String.format("%s/test-snp", outputDir); + final ArgumentsBuilder argsBuilderSNPOnly = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilderSNPOnly.addOutput(outputPrefixSNPOnly); + final File positiveAnnotationsHDF5SNPOnly = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.pos" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotationsSNPOnly = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5SNPOnly); + addPositiveAnnotationsSNPOnly + .andThen(ADD_SNP_MODE) + .apply(argsBuilderSNPOnly); + runCommandLine(argsBuilderSNPOnly); + + final String outputPrefixSNPPlusIndel = String.format("%s/test-snpIndel", outputDir); + final ArgumentsBuilder argsBuilderSNPPlusIndel = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilderSNPPlusIndel.addOutput(outputPrefixSNPPlusIndel); + final File positiveAnnotationsHDF5SNPPlusIndel = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.pos" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotationsSNPPlusIndel = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5SNPPlusIndel); + addPositiveAnnotationsSNPPlusIndel + .andThen(ADD_SNP_MODE) + .apply(argsBuilderSNPPlusIndel); + runCommandLine(argsBuilderSNPPlusIndel); + + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s %s", + outputPrefixSNPOnly + ".snp" + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX, + outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s %s", + outputPrefixSNPOnly + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX, + outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX)); + } + + @Test(expectedExceptions = IllegalArgumentException.class, groups = {"python"}) // python environment is required to run tool + public void testUnlabeledAnnotationsSpecifiedWithoutCalibrationSensitivityThreshold() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final String extractTag = "extract.nonAS.snpIndel.posUn"; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder>
addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + addPositiveAnnotations + .andThen(addUnlabeledAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = IllegalArgumentException.class, groups = {"python"}) // python environment is required to run tool + public void testCalibrationSensitivityThresholdSpecifiedWithoutUnlabeledAnnotations() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final String extractTag = "extract.nonAS.snpIndel.posUn"; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final double calibrationSensitivityThreshold = CALIBRATION_SENSITIVITY_THRESHOLD; + final Function<ArgumentsBuilder, ArgumentsBuilder> addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, calibrationSensitivityThreshold); + addPositiveAnnotations + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = IllegalArgumentException.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAndUnlabeledAnnotationNamesAreNotIdentical() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // non-allele-specific + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.AS.snpIndel.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // allele-specific + final Function<ArgumentsBuilder, ArgumentsBuilder> addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final double calibrationSensitivityThreshold = CALIBRATION_SENSITIVITY_THRESHOLD; + final Function<ArgumentsBuilder, ArgumentsBuilder> addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, calibrationSensitivityThreshold); + addPositiveAnnotations + .andThen(addUnlabeledAnnotations) + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAnnotationsOfSpecifiedVariantTypesNotPresent() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder =
BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + ADD_SNP_MODE + .andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testUnlabeledAnnotationsOfSpecifiedVariantTypesNotPresent() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified + final Function<ArgumentsBuilder, ArgumentsBuilder> addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final double calibrationSensitivityThreshold = CALIBRATION_SENSITIVITY_THRESHOLD; + final Function<ArgumentsBuilder, ArgumentsBuilder> addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, calibrationSensitivityThreshold); + ADD_SNP_MODE.andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .andThen(addUnlabeledAnnotations) + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAnnotationForOneVariantTypeIsCompletelyMissing() { // TODO add analogous test that warning is emitted when annotation has zero variance?
+ final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + + // we will dummy up an annotations file that contains 2 annotations (ANNOT_1 and ANNOT_2) + // for 4 variants (2 SNPs and 2 INDELs); the INDELs will all have missing (i.e., NaN) ANNOT_1 values + final List<String> annotationNames = Arrays.asList("ANNOT_1", "ANNOT_2"); + final double[][] annotations = new double[][]{ + new double[]{1, 2}, // SNP + new double[]{3, 4}, // SNP + new double[]{Double.NaN, 2}, // INDEL + new double[]{Double.NaN, 4}}; // INDEL + final List<Boolean> isSubset = Collections.nCopies(4, true); + + final File positiveAnnotationsHDF5 = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile( + annotationNames, annotations, isSubset); + + try (final HDF5File positiveAnnotationsHDF5File = new HDF5File(positiveAnnotationsHDF5, HDF5File.OpenMode.READ_WRITE)) { + positiveAnnotationsHDF5File.makeDoubleArray("/labels/snp", new double[]{1, 1, 0, 0}); + positiveAnnotationsHDF5File.makeDoubleArray("/labels/training", new double[]{1, 1, 1, 1}); + positiveAnnotationsHDF5File.makeDoubleArray("/labels/calibration", new double[]{1, 1, 1, 1}); + } + final Function<ArgumentsBuilder, ArgumentsBuilder> addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + + ADD_SNP_MODE.andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } +} \ No newline at end of file diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 new file mode 100644 index 00000000000..773930b7e98 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2441845e66c7ccfbbde701f1736aade9f2c72f50f95f6e7a0a6e66fe0752a8 +size 31088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf new file mode 100644 index 00000000000..67a8e58fe29 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1d20489c2ff9b0ccba12a24c84d5d9fd61d62d8ffbb416593559120461b8140 +size 171038 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx new file mode 100644 index 00000000000..834ef275edd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4098f119d6d22391e7a6c839b12f1b1f1e36ced1df1fcbef1c4e1b09b2bd8704 +size 114263 diff --git
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 new file mode 100644 index 00000000000..bbeab77af5b --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f050ca3dc10bc4a9c205568320e7a829129e72fe72e1dba31098c7e0a1a11167 +size 31200 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..6d843706e64 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e9024e4b622dc4fdc5fd12bf59881a21d90a5099f34a58cdc02f700416c2af4 +size 39248 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx new file mode 100644 index 00000000000..4cd06d7a707 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f21bccaab78e09cc3f48fc9bef944e7f8b96ce3dbd30179de611c9622629100 +size 114265 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 new file mode 100644 index 00000000000..5d0b70e972d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a06b6b23ed66bc05fd45c80beca1c103a52dca5b81409babb6fa899dff0bc2 +size 152912 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf new file mode 100644 index 00000000000..fef16673a21 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3060969881dbc006d167f09817924d38b6345e25976ac53880f624d94aea68e9 +size 193277 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx new file mode 100644 index 00000000000..3151b6bf26e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590a999540ccf1ba0ecd95fcef993f16436acbde20a19a606c3101c8d734bb8c +size 114298 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 new file mode 100644 index 00000000000..d27272a8169 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d7bdbcf4e37f722dc13b0f944238f99f50a40cdde09fbd8e297ea7a78ac95ab +size 153184 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..91c0861efca --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf803b41fad0be998389707101c0b35ab80074a2360b1108c8730cc8e703816 +size 33120 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf new file mode 100644 index 00000000000..3dbb5880865 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf5ee7adacf635c73d7493b99cc8df19a31acbbec991fbe5173e7cd6b405491 +size 193281 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx new file mode 100644 index 00000000000..0e9f0d3a39d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41568d1de020fdbe972628ba36d47d03bafabacbe578c474b868eeda8b903be2 +size 114300 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 new file mode 100644 index 00000000000..e0f693cb0d1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb09a014315261f93ea375d123b479c670839401b7fd518f7b4bf3998388827c +size 180856 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf new file mode 100644 index 00000000000..1b2a380111c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea0ee6bca7622ae8670c2f8ee2930a3223ba5d3edd89871c8e4b5cf3cf96f9f +size 196269 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx new file mode 100644 index 00000000000..68badf87382 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ebcd9088ef7de5b901316ed2a23b81f8e3bb8a58d15b0f9220dcf4cc590c8f2 +size 114496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 new file mode 100644 index 00000000000..3a6ca28d90d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e50f7011bfa07113b9abfa885ab8e5fcb3753eca41c96a86403a0c4dac74125d +size 181264 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..8c86f2e7674 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f519745eeceac897ad1b03bb6412aa5fbed60f5366e8614dec7c0a05e5f6e7 +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf new file mode 100644 index 00000000000..201b4860fa1 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa4160640da9143000d5f3b2497ca20c02c0944ca53cfa03b1a63d935b2cf2e +size 196279 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx new file mode 100644 index 00000000000..960ee8b9d4f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5112f2a5cfbfcbea25be8cbd9f1fde5f3b3c84ae6981ddbc304afef4f2cbdc9d +size 114498 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 new file mode 100644 index 00000000000..2d385db28f5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85932f2adf41c57634f4610dd78446284e937a32e12344045308895259ff9686 +size 31088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx new file mode 100644 index 00000000000..545dd32d2dd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3751f13f13783c5c580053bf17e79c48cce582c588bee3f09b39dc535131a6fa +size 114266 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 new file mode 100644 index 00000000000..2d385db28f5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85932f2adf41c57634f4610dd78446284e937a32e12344045308895259ff9686 +size 31088 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..4ba95be06c8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5602062a62176c9430426ca6818c887af6244d38976978c73d9eb7e90b12d825 +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx new file mode 100644 index 00000000000..e637efbecf8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5058b4e06c1773f6693991c26ee45e2dc095cfb38f2073ba1c6d14954636c135 +size 114268 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 new file mode 100644 index 00000000000..dcd7adedb1e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab001c1c4f75558ea4724800a0a167b339f9801b3781dba35b4cb05574452d08 +size 153048 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf new file mode 100644 index 00000000000..665f9422ec8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fada922eff03dbd88bbe2e8d593a5cd194e657babb98db49b6c14adae0c2f9de +size 193313 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx new file mode 100644 index 00000000000..0d6dd8f3a5c --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ddbcd95fe053e9dc9b0593f6fa4dd7e63b9960d7b646bf7d8476e9338827152 +size 114301 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 new file mode 100644 index 00000000000..230577f3f18 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9694a52e7485a62c7104e0a8113ae2c7cb64e02fa42b895716d89c0c7ef6adb8 +size 153048 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..4958f31f7c6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97d0502e52fe9821be8291db8100d197473538347bdd45f6d117ee0d04f0e511 +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf new file mode 100644 index 00000000000..665f9422ec8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fada922eff03dbd88bbe2e8d593a5cd194e657babb98db49b6c14adae0c2f9de +size 193313 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx new file mode 100644 index 00000000000..e334fdcd2f3 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cfb720e4d4ddbb52611d9730740d92d4b3f5b21af490688e57b860ee5a49b52 +size 114303 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 new file mode 100644 index 00000000000..5fa7822dca6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f11eee781282323d3accdcfc82db103238ad094f5fc5ef1dcdd3d9adc806e6 +size 180992 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf new file mode 100644 index 00000000000..abec25cff9d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b539e52f7e0f24e840845ce3d090e4f55e7444d50c2a44177bb62041fb172a +size 196311 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx new file mode 100644 index 00000000000..6c984b4dfe3 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abeae05044326f9cba30da108b5ded10265b3f7884a74cf30c322ac06080d39f +size 114502 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 new file mode 100644 index 00000000000..ef3dee693c1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e21468903993f7c31cd808e22a8a34d7dac95451215f66177135651938af505 +size 180992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..c2e342413c6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5c8a0cde229c2e5714c4e412ef0c0952b5b40a9c268ba3ff04d30a882b56b7 +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf new file mode 100644 index 00000000000..abec25cff9d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b539e52f7e0f24e840845ce3d090e4f55e7444d50c2a44177bb62041fb172a +size 196311 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx new file mode 100644 
index 00000000000..0f734427511 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4ae5483ce2b01b56109c77366a32aa3661a88cdbbf60567f9c4dba25cf9ea6 +size 114504 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf new file mode 100644 index 00000000000..5bb2ef3ab94 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea6cbe230a5a18f3447cfd5d29ce2787fd4a625128ab147ce0a1b207e577d50 +size 2013818 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx new file mode 100644 index 00000000000..6926fb95f58 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f40a26b8528447a9d1b1154643cbd682a154e91a67e6dda58cf11a620a1af3dc +size 5387 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz new file mode 100644 index 00000000000..4157ac3128e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5804bcbfb060e10c3aa841a4a92acfbafbf1b24c88c87fceaa0d9089eee699e +size 127853 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi new file mode 100644 index 00000000000..bc59b8a6e25 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d09b831af6a1b8585c26da1b29d131f8983f121703c5131a6596a1e81e0408f +size 2141 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz new file mode 100644 index 00000000000..5a556e7a0d7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5c6559d5c1567042ddb0fb05d7a5b7d9a07c56c61d8d21adfa85c15bf44e24fa +size 132259 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi new file mode 100644 index 00000000000..a7a45835346 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf9f97389369ac5e5a41420e58aaa3fa0a5f5edc21a6bd04b7e18c5bc21c914 +size 2542 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz new file mode 100644 index 00000000000..187e5f24e86 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b610a0aeccbec80b69572abcb89e1d3c5e96bc7df22b38b8dccf0b3c6b0ed1b5 +size 45717 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi new file mode 100644 index 00000000000..582b14d068e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91926cca5a1c36a336f54ba918d0fe0581a6f6e89421a971c78c39aa9e5dd3e6 +size 2040 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz new file mode 100644 index 00000000000..38011d42e49 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5063e401c67443ce0c12c1534b3b1284fe690c826c8987d0430e516193d062ce +size 49655 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi new file mode 100644 index 00000000000..27bd4edcff5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a738a699beff718443d36022bf9fb35686498f63d7f8e5c40f79ef26e3d5908 +size 2465 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 new file mode 100644 index 00000000000..3e8642a3b87 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ddd32415de62806eb4d6e9f6d877ce48ff4afdf866ddc99ac50f4f608a84f76 +size 769360 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 new file mode 100644 index 00000000000..2edf48f3050 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720a5f717bc736a97a45c9b94ff7b1b2e0b6a72882ce02e3ac41b5760064a42e +size 35136 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf new file mode 100644 index 00000000000..0c56eab2dd0 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe7d8c66fa02024fcacb5356f7a8f63fe68f4c45f451fa1f40074ba60e12cdd +size 2226691 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx new file mode 100644 index 00000000000..9e44f3a8d25 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f55d658a38cb5ee5e03a2352a03b455887c025a406469864e0b4704ae1e703a +size 119222 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 new file mode 100644 index 00000000000..ad25b5fd13a --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e76640cbe974ea3c90671b1511d8d741346b08da8a7bbb04af694347e470bb +size 858296 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 new file mode 100644 index 00000000000..09f07d2e6bd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b2c801b83c65db356ecb922cb4e5a113b4d51d268a751cb4755d5ff208f82b +size 38440 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf new file mode 100644 index 00000000000..6657efb8919 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:344467e055f48a6af7eb0dae413a6890c289bfdc5603f94f7111bbcefc1ef096 +size 2242652 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx new file mode 100644 index 00000000000..320366021d7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99de9277bc4a27e2053bc126fff4fa40732235081c34cc04eb12297b1a10f094 +size 119227 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 new file mode 100644 index 00000000000..e892f373649 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:999a5d700940f1ee3f34f824d055583e71a74f8cce515e9753c8e32ef72199ed +size 766368 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 new file mode 100644 index 00000000000..1d346b1433d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56523ce8c46dd132b26ddcd00d4c4404fa6285807c8425daf0b5a96a78a556ad +size 34960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf new file mode 100644 index 00000000000..e46bbcf2a15 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:993e2d40dea8558c001a7321a4bbe4804877b2de36c3a266416310446c915ccb +size 2226076 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx new file mode 100644 index 00000000000..0c6d0619b61 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5b8b9eb01cd5e86c2c9e57fa9f60cf6baab5474cba697b93e554557b197844 +size 119225 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 new file mode 100644 index 00000000000..d16045c4035 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ef7d6bebcb961f25ee1d83280c22ac1ef664250ee7a5466af87f014f278c9e7 +size 829672 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 new file mode 100644 index 00000000000..15ed17d3d28 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:889ea5a6e75168ed0925a20ed9a506374cc866d01eee7ef3d1e56916cb05ba5d +size 37720 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf new file mode 100644 index 00000000000..4af1921ce48 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05f85d264a457cdd81896bde03f51b2369343da5ade21b1c8df183a2b7e8f974 +size 2242450 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx new file mode 100644 index 00000000000..f93d3f908cc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f4bfa4bdbd17eb3b33f4464921c1616dcade460c37558436b16583c4c6dec17 +size 119230 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..ee4e288ce0d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c86e349291873b6842482917dcbc4adaca389d7b1024f08ac7635207ed658bec +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..ec6a6946b59 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11c2aaf79130735fe23b1310f1e71639d2732245ba2aa6f980f32f0000a5007 +size 383290 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..327e4870279 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4ba38c85f7d0552bd425a9983bc9b9edde3235af5bd9a90d20b7a1341a8500 +size 547416 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..38a580ccdf8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05634a06c9eaa7d9668beac0b9fc2abb2fc1ae3abce4f3f9bc961b710eaa3adf +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..63812d55193 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5edc21e043e3e00dc52a8c9406d420036af0bc6ea8bbd8a30691431703a7457 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..59667fa4758 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9747a320daa223c085327584cfa619fb13b72c8a7ad79a4423600bcb445557d1 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..dc93df3c615 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5c8e2cb4090e091c5689bc533a55ce078a339f33e44b48a04c76dbcd39d8d3 +size 385484 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..cf53a88b22d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce595a7d881ba30a91faf0633652176d23475d987cdade18e28fc3ad7b50c592 +size 547672 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..bf9afd308b6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52e02e103f8da5f68df7dcbb2310ca2b3577051d897cf0d912361a1c0465bef +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..b2b15d2ada5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae02f7c771453a2cf9c2ccc496374bf59a730130319cfbea890a23b241b972ec +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..84d18527553 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10684b03e9cda817198599a6ff16bc72e0a7de0af61fed5fe6a3585ae3afd77d +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..37b6aaaee1b --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7c7f9b9edb775e5e550206e9fb600e4e4935743faf2d25b1418ac5645e89ced3 +size 547414 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..81e0cf0bc41 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1417290317ae1919cb53bc20df786b22a55a60db8f6b486a261fcc00767c106 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..2903011df8e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b768396c95302b2417c3966dda6ec96bf39a323bb86f882ebbf7d62dba72a475 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..cf53a88b22d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce595a7d881ba30a91faf0633652176d23475d987cdade18e28fc3ad7b50c592 +size 547672 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..1d1e1ca254e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aae124b17cace08e3eb7ec5b29f03bbcfb4aca6d216c216b57d127b992ea100 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 new file mode 100644 index 
00000000000..e2fbacb00bf --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ed77628bdb203281ca2c2ff53cf378fcdfff7ae164c85a7543be7243b5453b +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl new file mode 100644 index 00000000000..106582be542 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc3a904292d0fef1ce0d4dfacf5d19736cc39d6be9e9a5b757bbcba3fe3a8bb +size 133292 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl new file mode 100644 index 00000000000..086acb5b830 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca24f63b37c8e47a7dc9a6b538fcfff85e61b901759c267bd7c5b1412864032 +size 248621 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..cb888d205de --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe27be5af1f958112a7d96d51bd1859a21b4ac2468175e7570fdc5270149d550 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..cf187c5d4cf --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761c2f2c063a5c80223fe247ec999308596dd71b619dc0ac5e608b89dcf54d19 +size 2472 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..f590dd72785 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f43a4e3ae888f8275c179412e407e25ca562922c9bb0cce2ace06d77474d7ea +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..6175b8f3538 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1911a9970b8020800b43a26314dd285a166f2519f6f891d54b246c352483b21f +size 383290 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..d8f0b75d96d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d1a8c9875ea6a045d9b8ec6c07b637a75063dc3f06954768238f3c034a73762 +size 547416 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..8af1c23b254 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:108e279f0708fedfb14af48be8103ea23b36e187d8f54d305dcef8c53fc7fb73 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..d448097c76b --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56b77554dcff0def314931f776a602ea12f791bd730adac0ebcbbdb154c3ff0a +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..204b0c6bde1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b0c96739db44b4c0e2075053fded315268821a070bd8535b2605555cf058c2e +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl new file mode 100644 index 00000000000..753b924b903 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0a91a10ac594a414747c6cca91724c919ef84997729e3a31dee21acbd1ad862 +size 131500 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..29acdf69596 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:971241c711511787ea2174729a132ecb6fb194203a0a687a94be1b3c2f386e61 +size 258117 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..e2a66ea327a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab06dd2dbd424d8199fd29ec7f3055063ebe11b8c670a2fec4a3a5387421d487 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..8cb5feba953 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e9e5ee795e34d0a14410485f674a4c6626991ef2ed7a737c9ec925755afe3b +size 2472 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..681be839ce0 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20abec02257257e46c2587fdd00d4f22d0ede66869b14c5a44439fac065546ff +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..dc93df3c615 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5c8e2cb4090e091c5689bc533a55ce078a339f33e44b48a04c76dbcd39d8d3 +size 385484 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..cf53a88b22d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce595a7d881ba30a91faf0633652176d23475d987cdade18e28fc3ad7b50c592 +size 547672 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file 
mode 100644
index 00000000000..db4d29a60b6
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31edd143f65ebd84d8d6478b1182a2c641cc28210e224cf4ccff449b0d433bfd
+size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
new file mode 100644
index 00000000000..319c56e2887
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:618a074fdbaf3821d0320871eaeead76b657c2a08120d1d8e600da460454d92b
+size 3192
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5
new file mode 100644
index 00000000000..e3ee61cde33
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:066bff7fcc54d96fc388b2d2e810737a2fae092295ab23d1a86fd26248875fa2
+size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl
new file mode 100644
index 00000000000..f371e4a8c40
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e60950ec257dd1dbafa4438b6454eef3561b7fe8aab977f939ac1aa484d6ef21
+size 248621
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5
new file mode 100644
index 00000000000..f50551b6df5
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1afbe046bf9f6843f0c70d2b80f1c6d508be0cc020eee123e5581a670404097
+size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..450f847c31b
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6dd0a7cf575dbe4361d2a8c7e1ce827a9e19b14a1da60171a2f21d62c6ab90c
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl
new file mode 100644
index 00000000000..78f99b8c18b
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32322fcf64720c3d7f0761d0214fe1655ad3a774444a20fb559ff4a04b2d673d
+size 547416
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..4bfae9c79c1
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68628cf4169c4bfea257db989295514fc55ab6be0214adb036b97d3e89c522be
+size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5
new file mode 100644
index 00000000000..b9275e66479
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9bdfd5475708edfdb45b3be9859d15c23e853bfe31efeb1051990479ac590b
+size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl
new file mode 100644
index 00000000000..29acdf69596
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:971241c711511787ea2174729a132ecb6fb194203a0a687a94be1b3c2f386e61
+size 258117
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5
new file mode 100644
index 00000000000..89609f5b1ba
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:457da91649faabf0c97454c72ecb2599a94e7d0f5e0e13209526521fd2d24967
+size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..f54673180c0
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e0b40cfb3c99d390509b05fdc3acf6dae85261cd27311056ff983c27d4727c5
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl
new file mode 100644
index 00000000000..cf53a88b22d
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce595a7d881ba30a91faf0633652176d23475d987cdade18e28fc3ad7b50c592
+size 547672
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..08e917e9cee
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95a334d5dbbc130e5aa78992b39c30d9ba5033a85b79a23977bf26775a123f27
+size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..eacf51121bd
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:337e9ae2798650c76e1639de6d873c924ff5bcb5d9aed01ddb1b118592121362
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
new file mode 100644
index 00000000000..8771f724f11
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e3cd0c75f8b6ade95ced7cd03d86e809580bfe65c8455cfb0b526a3dedbea12
+size 368367
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
new file mode 100644
index 00000000000..87adbb0b4be
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b078a2ae61a30ff3e7ca753f21d9294eec8cebbab42a1eabc7f3753e21617f0e
+size 556676
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..0b2c64749b8
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39797c6b95bbc8ac687042da7b60b28a24c54fbc4321c64a3d0559d4b77eef8f
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
new file mode 100644
index 00000000000..8468ae9a55e
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60e748fe1e1e26a006e1d4ece97abff9e5723883e8ad1774e0cd63b17bacca6e
+size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..e05d0ba0cdd
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1ee9ab7fbf01e834c32729a84c1f699fee7896a20b4966f88d8b73ffe9f1ee1
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
new file mode 100644
index 00000000000..f3d63e89231
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02ea61d317a9782fb49689fe18b4ecb100538c771a2ca6403c1b1c323358e1a3
+size 359136
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
new file mode 100644
index 00000000000..90b198fbde4
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf9f706ee2ee2241663c9a8cc2a939cb851f58eec9dd551baf38c669c12cd20
+size 525313
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..79fbe9271ec
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fedd367ae518ca7a420684e09f6e043e62b338263ea328edfa1feac731c58717
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
new file mode 100644
index 00000000000..f3505388573
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7539e559f152311925d552f8143514b00b59e5f2a3727fff35213f09a446c17e
+size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..f0a49de465e
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34bbbd98bf30c5abee92ecd5bf1e5838f4310c7e60b5e925985294c0b80ee897
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl
new file mode 100644
index 00000000000..3777424a4cf
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:524e8c41b21f3efabec74bfed799941ee02083d578c65351e672329b724765c5
+size 556676
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..1e710180709
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a18628b0dd05c50e41771aef301834a7d9f070e410aa48ad2b33702da8b5767c
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..a04fc38d0bd
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b6ddc616d7e353b2a9c2dcd3830065386cbe5ebf2ee3833ca8833390217717b
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl
new file mode 100644
index 00000000000..90b198fbde4
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf9f706ee2ee2241663c9a8cc2a939cb851f58eec9dd551baf38c669c12cd20
+size 525313
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..d97235df500
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea99a9aec1c142b5d4c08a5f70027bcf6db8b1dc06cf8b6e7072e5300cf33dba
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
new file mode 100644
index 00000000000..5499e96df9c
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d65f9ac8acf8ae8fd99de8f2148357283c18d2210357f2f961b06ecc5d9ea476
+size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
new file mode 100644
index 00000000000..9876244c2a0
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46e82c2222e1e206cc5f15c6b2e173ec30c79be5f3466313b841a9fc0947409d
+size 108247
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
new file mode 100644
index 00000000000..7d9ae303257
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a3102dc2bccf300f8a101a8087c22edfc70049274691eb58691f627e7a5cf77
+size 259163
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
new file mode 100644
index 00000000000..fbce46f8a73
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e99a57eb29259b0cb76876ba69a00009bcce9a0197c390405b6bbe6dc2d6ac00
+size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
new file mode 100644
index 00000000000..b9fcf1a9d4f
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74b2d49d826bab13fbd1a862ce0ef75adeb939c625eb9aa4f9cc4403ef078b5c
+size 2496
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..6a63227e00b
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5b62757c20f97475e7f2fa4c30caadac7b98d2483b2d7b9bd83e6c442a5f428
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
new file mode 100644
index 00000000000..9b4df1f88db
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5b5eee807d0db47119926067f54a7f38ad242090d19b50005d849c19ea06b9b
+size 368366
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
new file mode 100644
index 00000000000..4f032f617ed
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e766e906340119331ede315b578af92de6c7e307a8f4009ea7ec3268bac8eed
+size 556675
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..42d57b45d1e
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b23331c08949ef559ccb282553fabd14849982c4491004eb35c99450a6981ee5
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
new file mode 100644
index 00000000000..2438f2bdeeb
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf627f796420f66f21289f0f138a47b5f1a2f4fe8665484819853147a0958523
+size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
new file mode 100644
index 00000000000..54bf9ede9dd
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5237283409f8c9c3380e36f361a74c5c79531af73b70c1abe0cd8e7e758ab423
+size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
new file mode 100644
index 00000000000..0821f41755c
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b110c498b70170a3e41258344ef78fdc266326168f253ed076a203f94dcfe45d
+size 132824
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
new file mode 100644
index 00000000000..b880fca3d31
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e46a4531d68bdf6122231ba3ed3e263c091089ab227d7871ba3349c21c93f4f
+size 248814
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
new file mode 100644
index 00000000000..2cddb25120d
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:284f5f52917ce99f3b7679373c6ae011edb3c91bd8a1ddb675e1f115dda3e810
+size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
new file mode 100644
index 00000000000..9580c080eb4
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b607fcf3181b0cef441617146cfc44d0e49a9e7f2a1f1c1e3ee778a632841586
+size 2496
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..bb5056a3d90
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24708a2c6f66d23669ffc52eb63e9372e9883d6517371a23d9e81c22fcf0db46
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
new file mode 100644
index 00000000000..f3d63e89231
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02ea61d317a9782fb49689fe18b4ecb100538c771a2ca6403c1b1c323358e1a3
+size 359136
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
new file mode 100644
index 00000000000..90b198fbde4
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf9f706ee2ee2241663c9a8cc2a939cb851f58eec9dd551baf38c669c12cd20
+size 525313
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..ba37a2d0a75
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec107cfac03b83f9062a5f1f40bd76fe4cd5e2641b37b1d0027417576c5f77e4
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
new file mode 100644
index 00000000000..4a6efa95dca
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbbbf3723990daa0f398120b36fa2c6b8354681adb74ae2298d2e05a22956372
+size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5
new file mode 100644
index 00000000000..c400fa9af57
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b13c6ebe092625234fb0878bfcea5f0a75a092626a2dd074ad384689bbc5c98
+size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl
new file mode 100644
index 00000000000..ce6ed251be8
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdf14d85d1b0b58a44c851ee8b84e8a10d78bfb8372139dd9ef229d51791dc78
+size 259163
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5
new file mode 100644
index 00000000000..45cd8fd5d76
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d51882fd91528fb6005dc73fd7d4ea9ca2b57ba3426004603f839eafa4aa4305
+size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..98afa8e174e
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5b380fd59ed8afc20098ff5a8f6fbd7e6e90c7bd7f873ba6b91951ece600d76
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl
new file mode 100644
index 00000000000..14c00193bba
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f213af5f6931cc35a0f21eb998740412e1e42e7274b916bc639cd4fb07922dd
+size 556675
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..8a64fbc2814
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:803ee77783e2c52f888b6fcb64b9179e4ca091b995e53735204369ed1f823de4
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5
new file mode 100644
index 00000000000..2d138011e4c
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16e12374eaf43eb990e52c43718c6db20e6d8ba715f334f43dfdb2af6da97b3d
+size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl
new file mode 100644
index 00000000000..b880fca3d31
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e46a4531d68bdf6122231ba3ed3e263c091089ab227d7871ba3349c21c93f4f
+size 248814
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5
new file mode 100644
index 00000000000..ae946940860
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bf264b767541c3036d88042e213d27fb1a4a3bd3663173022854844ae57d258
+size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5
new file mode 100644
index 00000000000..95b3d2fcb5f
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bbb2201dcddea0e3ce7f96377edc2aafb05134c60926d11df0c103795bbd006
+size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl
new file mode 100644
index 00000000000..90b198fbde4
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf9f706ee2ee2241663c9a8cc2a939cb851f58eec9dd551baf38c669c12cd20
+size 525313
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5
new file mode 100644
index 00000000000..3d569294bb2
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a7c429c04a96894c0d6d80cf27940eed25e9254538e9c1cdd5250554a7ba009
+size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json
new file mode 100644
index 00000000000..6fbb7d105da
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddca401b3f0fdceedc96946c8ced9870984f1ae34ce5e5626cc4b08152639532
+size 23