From ea6ae8273573ebf5c024cc6b54a10a39243f5865 Mon Sep 17 00:00:00 2001
From: samuelklee
Date: Fri, 22 Sep 2023 13:01:19 -0400
Subject: [PATCH] Performed a round of ablation on new annotation-based
filtering tools. (#8131)
* Performed a round of ablation on new annotation-based filtering tools.
* Removed Javadoc tags unsupported by Barclay in VETS tool documentation and fixed other minor documentation issues.
---
.../run_vcf_site_level_filtering_wdl.sh | 4 -
.../vcf_site_level_filtering_pos_neg.json | 19 --
.../JointVcfFiltering.wdl | 17 +-
.../scalable/ExtractVariantAnnotations.java | 69 ++--
.../LabeledVariantAnnotationsWalker.java | 26 +-
.../scalable/ScoreVariantAnnotations.java | 78 ++---
.../TrainVariantAnnotationsModel.java | 311 +++++-------------
.../data/LabeledVariantAnnotationsDatum.java | 4 +-
.../modeling/BGMMVariantAnnotationsModel.java | 1 +
...ava => PythonVariantAnnotationsModel.java} | 16 +-
...va => PythonVariantAnnotationsScorer.java} | 6 +-
.../modeling/VariantAnnotationsModel.java | 3 +-
.../modeling/VariantAnnotationsScorer.java | 17 -
...ractVariantAnnotationsIntegrationTest.java | 11 -
...coreVariantAnnotationsIntegrationTest.java | 10 +-
...ariantAnnotationsModelIntegrationTest.java | 111 ++-----
...in.snpIndel.posNeg.IF.score.snp.annot.hdf5 | 3 -
...n.snpIndel.posNeg.IF.score.snp.scores.hdf5 | 3 -
...sUn.train.snpIndel.posNeg.IF.score.snp.vcf | 3 -
...train.snpIndel.posNeg.IF.score.snp.vcf.idx | 3 -
...pIndel.posNeg.IF.score.snpIndel.annot.hdf5 | 3 -
...Indel.posNeg.IF.score.snpIndel.scores.hdf5 | 3 -
...rain.snpIndel.posNeg.IF.score.snpIndel.vcf | 3 -
....snpIndel.posNeg.IF.score.snpIndel.vcf.idx | 3 -
...n.snpIndel.posOnly.IF.score.snp.annot.hdf5 | 3 +
....snpIndel.posOnly.IF.score.snp.scores.hdf5 | 3 +
...Un.train.snpIndel.posOnly.IF.score.snp.vcf | 3 +
...rain.snpIndel.posOnly.IF.score.snp.vcf.idx | 3 +
...Indel.posOnly.IF.score.snpIndel.annot.hdf5 | 3 +
...ndel.posOnly.IF.score.snpIndel.scores.hdf5 | 3 +
...ain.snpIndel.posOnly.IF.score.snpIndel.vcf | 3 +
...snpIndel.posOnly.IF.score.snpIndel.vcf.idx | 3 +
...in.snpIndel.posNeg.IF.score.snp.annot.hdf5 | 3 -
...n.snpIndel.posNeg.IF.score.snp.scores.hdf5 | 3 -
...sUn.train.snpIndel.posNeg.IF.score.snp.vcf | 3 -
...train.snpIndel.posNeg.IF.score.snp.vcf.idx | 3 -
...pIndel.posNeg.IF.score.snpIndel.annot.hdf5 | 3 -
...Indel.posNeg.IF.score.snpIndel.scores.hdf5 | 3 -
...rain.snpIndel.posNeg.IF.score.snpIndel.vcf | 3 -
....snpIndel.posNeg.IF.score.snpIndel.vcf.idx | 3 -
...n.snpIndel.posOnly.IF.score.snp.annot.hdf5 | 3 +
....snpIndel.posOnly.IF.score.snp.scores.hdf5 | 3 +
...Un.train.snpIndel.posOnly.IF.score.snp.vcf | 3 +
...rain.snpIndel.posOnly.IF.score.snp.vcf.idx | 3 +
...Indel.posOnly.IF.score.snpIndel.annot.hdf5 | 3 +
...ndel.posOnly.IF.score.snpIndel.scores.hdf5 | 3 +
...ain.snpIndel.posOnly.IF.score.snpIndel.vcf | 3 +
...snpIndel.posOnly.IF.score.snpIndel.vcf.idx | 3 +
...n.snp.posNeg.IF.snp.calibrationScores.hdf5 | 3 -
...rain.snp.posNeg.IF.snp.negative.scorer.pkl | 3 -
...l.posUn.train.snp.posNeg.IF.snp.scorer.pkl | 3 -
...rain.snp.posNeg.IF.snp.trainingScores.hdf5 | 3 -
...ain.snp.posNeg.IF.snp.unlabeledScores.hdf5 | 3 -
...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 -
...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 -
....snp.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 -
...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 -
...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 -
...del.posNeg.IF.indel.calibrationScores.hdf5 | 3 -
...pIndel.posNeg.IF.indel.negative.scorer.pkl | 3 -
....train.snpIndel.posNeg.IF.indel.scorer.pkl | 3 -
...pIndel.posNeg.IF.indel.trainingScores.hdf5 | 3 -
...Indel.posNeg.IF.indel.unlabeledScores.hdf5 | 3 -
...Indel.posNeg.IF.snp.calibrationScores.hdf5 | 3 -
...snpIndel.posNeg.IF.snp.negative.scorer.pkl | 3 -
...Un.train.snpIndel.posNeg.IF.snp.scorer.pkl | 3 -
...snpIndel.posNeg.IF.snp.trainingScores.hdf5 | 3 -
...npIndel.posNeg.IF.snp.unlabeledScores.hdf5 | 3 -
...DifferentSeed.indel.calibrationScores.hdf5 | 3 -
....IFDifferentSeed.indel.negative.scorer.pkl | 3 -
...el.posNeg.IFDifferentSeed.indel.scorer.pkl | 3 -
....IFDifferentSeed.indel.trainingScores.hdf5 | 3 -
...IFDifferentSeed.indel.unlabeledScores.hdf5 | 3 -
...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 -
...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 -
...ndel.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 -
...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 -
...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 -
...n.snp.posNeg.IF.snp.calibrationScores.hdf5 | 3 -
...rain.snp.posNeg.IF.snp.negative.scorer.pkl | 3 -
...l.posUn.train.snp.posNeg.IF.snp.scorer.pkl | 3 -
...rain.snp.posNeg.IF.snp.trainingScores.hdf5 | 3 -
...ain.snp.posNeg.IF.snp.unlabeledScores.hdf5 | 3 -
...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 -
...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 -
....snp.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 -
...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 -
...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 -
...del.posNeg.IF.indel.calibrationScores.hdf5 | 3 -
...pIndel.posNeg.IF.indel.negative.scorer.pkl | 3 -
....train.snpIndel.posNeg.IF.indel.scorer.pkl | 3 -
...pIndel.posNeg.IF.indel.trainingScores.hdf5 | 3 -
...Indel.posNeg.IF.indel.unlabeledScores.hdf5 | 3 -
...Indel.posNeg.IF.snp.calibrationScores.hdf5 | 3 -
...snpIndel.posNeg.IF.snp.negative.scorer.pkl | 3 -
...Un.train.snpIndel.posNeg.IF.snp.scorer.pkl | 3 -
...snpIndel.posNeg.IF.snp.trainingScores.hdf5 | 3 -
...npIndel.posNeg.IF.snp.unlabeledScores.hdf5 | 3 -
...DifferentSeed.indel.calibrationScores.hdf5 | 3 -
....IFDifferentSeed.indel.negative.scorer.pkl | 3 -
...el.posNeg.IFDifferentSeed.indel.scorer.pkl | 3 -
....IFDifferentSeed.indel.trainingScores.hdf5 | 3 -
...IFDifferentSeed.indel.unlabeledScores.hdf5 | 3 -
...IFDifferentSeed.snp.calibrationScores.hdf5 | 3 -
...eg.IFDifferentSeed.snp.negative.scorer.pkl | 3 -
...ndel.posNeg.IFDifferentSeed.snp.scorer.pkl | 3 -
...eg.IFDifferentSeed.snp.trainingScores.hdf5 | 3 -
...g.IFDifferentSeed.snp.unlabeledScores.hdf5 | 3 -
108 files changed, 269 insertions(+), 710 deletions(-)
delete mode 100644 scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
rename src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/{PythonSklearnVariantAnnotationsModel.java => PythonVariantAnnotationsModel.java} (81%)
rename src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/{PythonSklearnVariantAnnotationsScorer.java => PythonVariantAnnotationsScorer.java} (92%)
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf
create mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
delete mode 100644 src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
index f25ad6bb191..27d5e522510 100644
--- a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
+++ b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh
@@ -28,7 +28,6 @@ fi
echo "Docker build done =========="
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
echo "Running Filtering WDL through cromwell"
@@ -41,6 +40,3 @@ done
FIN
cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json
-
-cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
-java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
deleted file mode 100644
index ee2d116e1d4..00000000000
--- a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_pos_neg.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
- "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__",
- "JointVcfFiltering.input_vcfs": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz",
- "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz",
- "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz"],
- "JointVcfFiltering.input_vcf_idxs": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21.avg.vcf.gz.tbi",
- "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr22.avg.vcf.gz.tbi",
- "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.empty.avg.vcf.gz.tbi"],
- "JointVcfFiltering.sites_only_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz",
- "JointVcfFiltering.sites_only_vcf_idx": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.chr21_chr22.sites_only.vcf.gz.tbi",
- "JointVcfFiltering.annotations": ["ReadPosRankSum", "FS", "SOR", "QD"],
- "JointVcfFiltering.output_prefix": "test_10_samples",
- "JointVcfFiltering.resource_args": "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
- "JointVcfFiltering.extract_extra_args": "-L chr21 --maximum-number-of-unlabeled-variants 10000000",
- "JointVcfFiltering.train_extra_args": "--calibration-sensitivity-threshold 0.95",
- "JointVcfFiltering.score_extra_args": "--snp-calibration-sensitivity-threshold 0.95 --indel-calibration-sensitivity-threshold 0.95"
-}
\ No newline at end of file
diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
index 55b5fa1c390..7b265c45c54 100644
--- a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
+++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
@@ -29,8 +29,9 @@ workflow JointVcfFiltering {
String resource_args
String? model_backend
- File? python_script
+ File? training_python_script
File? hyperparameters_json
+ File? scoring_python_script
String? extract_extra_args
String? train_extra_args
@@ -55,9 +56,9 @@ workflow JointVcfFiltering {
model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
- extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
- train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool."
- score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
+ extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
+ train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-unlabeled learning, etc. See GATK documentation for this tool."
+ score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
}
call ExtractVariantAnnotations {
@@ -79,7 +80,7 @@ workflow JointVcfFiltering {
annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5,
unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5,
model_backend = model_backend,
- python_script = python_script,
+ python_script = training_python_script,
hyperparameters_json = hyperparameters_json,
output_prefix = output_prefix,
extra_args = train_extra_args,
@@ -101,6 +102,8 @@ workflow JointVcfFiltering {
extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx,
model_prefix = output_prefix,
model_files = TrainVariantAnnotationsModel.model_files,
+ model_backend = model_backend,
+ python_script = scoring_python_script,
extra_args = score_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
@@ -251,6 +254,8 @@ task ScoreVariantAnnotations {
File extracted_vcf_idx
String model_prefix
Array[File] model_files
+ String? model_backend
+ File? python_script
String? extra_args
File? monitoring_script
@@ -287,6 +292,8 @@ task ScoreVariantAnnotations {
~{resource_args} \
--resource:extracted,extracted=true ~{extracted_vcf} \
--model-prefix model-files/~{model_prefix}.train \
+ ~{"--model-backend " + model_backend} \
+ ~{"--python-script " + python_script} \
~{extra_args}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
index 0213ff0b97d..0611af6d420 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
@@ -31,7 +31,7 @@
* Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files.
*
*
- * This tool is intended to be used as the first step in a variant-filtering workflow that supersedes the
+ * This tool is primarily intended to be used as the first step in a variant-filtering workflow that supersedes the
* {@link VariantRecalibrator} workflow. This tool extracts site-level annotations, labels, and other relevant metadata
* from variant sites (or alleles, in allele-specific mode) that are or are not present in specified labeled
* resource VCFs (e.g., training or calibration VCFs). Input sites that are present in the resources are considered
@@ -65,7 +65,7 @@
*
* -
* Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles,
- * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified).
+ * if at least one allele-specific annotation with "Number=A" is specified).
*
* -
* Annotations to extract.
@@ -78,13 +78,12 @@
*
* -
* (Optional) Resource VCF file(s). Each resource should be tagged with a label, which will be assigned to
- * extracted sites that are present in the resource. In typical use, the {@value LabeledVariantAnnotationsData#TRAINING_LABEL}
- * and {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels should be used to tag at least one resource
- * apiece. The resulting sets of sites will be used for model training and conversion of scores to
+ * extracted sites that are present in the resource. In typical use, the "training"
+ * and "calibration" labels should be used to tag at least one resource apiece.
+ * The resulting sets of sites will be used for model training and conversion of scores to
* calibration-set sensitivity, respectively; the trustworthiness of the respective resources should be
- * taken into account accordingly. The {@value LabeledVariantAnnotationsData#SNP_LABEL} label is
- * reserved by the tool, as it is used to label sites determined to be SNPs, and thus it cannot be used to tag
- * provided resources.
+ * taken into account accordingly. The "snp" label is reserved by the tool, as it is used to label sites
+ * determined to be SNPs, and thus it cannot be used to tag provided resources.
*
* -
* (Optional) Maximum number of unlabeled variants (or alleles) to randomly sample with reservoir sampling.
@@ -128,19 +127,19 @@
*
* Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations).
* See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details.
- * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele;
+ * In allele-specific mode (i.e., when allele-specific annotations are requested), each record corresponds to an individual allele;
* otherwise, each record corresponds to a variant site, which may contain multiple alleles.
- * Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce
+ * Storage of alleles can be omitted using the "--omit-alleles-in-hdf5" argument, which will reduce
* the size of the file. This file will only be produced if resources are provided and the number of extracted
* labeled sites is nonzero.
*
*
*
* -
- * Labeled sites-only VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME}
+ * Labeled sites-only VCF file and index. The VCF will not be gzipped if the "--do-not-gzip-vcf-output"
* argument is set to true. The VCF can be provided as a resource in subsequent runs of
* {@link ScoreVariantAnnotations} and used to indicate labeled sites that were extracted.
- * This can be useful if the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument was used to
+ * This can be useful if the "--intervals/-L" argument was used to
* subset sites in training or calibration resources for extraction; this may occur when setting up
* training/validation/test splits, for example. Note that records for the random sample of unlabeled sites are
* currently not included in the VCF.
@@ -149,7 +148,7 @@
* (Optional) Unlabeled-annotations HDF5 file. This will have the same directory structure as in the
* labeled-annotations HDF5 file. However, note that records are currently written in the order they
* appear in the downsampling reservoir after random sampling, and hence, are not in genomic order.
- * This file will only be produced if a nonzero value of the {@value MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME}
+ * This file will only be produced if a nonzero value of the "--maximum-number-of-unlabeled-variants"
* argument is provided.
*
*
@@ -158,9 +157,9 @@
*
*
* Extract annotations from training/calibration SNP/INDEL sites, producing the outputs
- * 1) {@code extract.annot.hdf5}, 2) {@code extract.vcf.gz}, and 3) {@code extract.vcf.gz.tbi}.
+ * 1) extract.annot.hdf5, 2) extract.vcf.gz, and 3) extract.vcf.gz.tbi.
* The HDF5 file can then be provided to {@link TrainVariantAnnotationsModel}
- * to train a model using a positive-only approach. Note that the {@value MODE_LONG_NAME} arguments are made
+ * to train a model using a positive-only approach. Note that the "--mode" arguments are made
* explicit here, although both SNP and INDEL modes are selected by default.
*
*
@@ -182,11 +181,10 @@
*
* Extract annotations from both training/calibration SNP/INDEL sites and a random sample of
* 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs
- * 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz},
- * and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
- * to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}).
- * Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are
- * selected by default.
+ * 1) extract.annot.hdf5, 2) extract.unlabeled.annot.hdf5, 3) extract.vcf.gz,
+ * and 4) extract.vcf.gz.tbi. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
+ * to train a model using a positive-unlabeled approach. Note that the "--mode" arguments
+ * are made explicit here, although both SNP and INDEL modes are selected by default.
*
*
* gatk ExtractVariantAnnotations \
@@ -200,17 +198,23 @@
* --mode INDEL \
* --resource:indel-training,training=true indel-training.vcf \
* --resource:indel-calibration,calibration=true indel-calibration.vcf \
- * --maximum-number-of-unlableled-variants 1000000
+ * --maximum-number-of-unlabeled-variants 1000000 \
* -O extract
*
*
*
*
+ * Note that separate SNP and INDEL resources are shown in the above examples purely for demonstration purposes,
+ * as are separate training and calibration resources. However, it may be desirable to specify combined
+ * resource(s); e.g., "--resource:snp-and-indel-resource,training=true,calibration=true snp-and-indel-resource.vcf".
+ *
+ *
+ *
* In the (atypical) event that resource VCFs are unavailable, one can still extract annotations from a random sample of
- * unlabeled sites, producing the outputs 1) {@code extract.unlabeled.annot.hdf5},
- * 2) {@code extract.vcf.gz} (which will contain no records), and 3) {@code extract.vcf.gz.tbi}.
+ * unlabeled sites, producing the outputs 1) extract.unlabeled.annot.hdf5,
+ * 2) extract.vcf.gz (which will contain no records), and 3) extract.vcf.gz.tbi.
* This random sample cannot be used by {@link TrainVariantAnnotationsModel}, but may still be useful for
- * exploratory analyses. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both
+ * exploratory analyses. Note that the "--mode" arguments are made explicit here, although both
* SNP and INDEL modes are selected by default.
*
*
@@ -221,12 +225,20 @@
* -A annotation_N \
* --mode SNP \
* --mode INDEL \
- * --maximum-number-of-unlableled-variants 1000000
+ * --maximum-number-of-unlabeled-variants 1000000 \
* -O extract
*
*
*
- * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}.
+ *
+ * Alternatively, if resource VCFs are unavailable, one might want to specify the input VCF itself as a resource
+ * and extract annotations for the input variants (or a subset thereof). Again, this may be useful for
+ * exploratory analyses.
+ *
+ *
+ *
+ * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}.
+ *
*
* @author Samuel Lee <slee@broadinstitute.org>
*/
@@ -249,11 +261,10 @@ public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWa
doc = "Maximum number of unlabeled variants to extract. " +
"If greater than zero, reservoir sampling will be used to randomly sample this number " +
"of sites from input sites that are not present in the specified resources. " +
- "Choice of this number should be guided by considerations for training the negative model in " +
+ "Choice of this number should be guided by considerations for training the model in " +
"TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " +
"expected size of the labeled training set or that is compatible with available memory resources. " +
- "Note that in allele-specific mode (--" + LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME +
- " true), this argument limits the number of variant records, rather than the number of alleles.",
+ "Note that in allele-specific mode, this argument limits the number of variant records, rather than the number of alleles.",
minValue = 0)
private int maximumNumberOfUnlabeledVariants = 0;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java
index 32a72c0f5f0..108fe4bccbd 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java
@@ -9,6 +9,7 @@
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
+import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.collections4.ListUtils;
@@ -34,6 +35,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -87,7 +89,6 @@
public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVariantWalker {
public static final String MODE_LONG_NAME = "mode";
- public static final String USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME = "use-allele-specific-annotations";
public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter";
public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters";
public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic";
@@ -129,12 +130,6 @@ enum ResourceMatchingStrategy {
minElements = 1)
private List variantTypesToExtractList = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL));
- @Argument(
- fullName = USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME,
- doc = "If true, use the allele-specific versions of the specified annotations.",
- optional = true)
- boolean useASAnnotations = false;
-
@Argument(
fullName = IGNORE_FILTER_LONG_NAME,
doc = "Ignore the specified filter(s) in the input VCF.",
@@ -159,13 +154,13 @@ enum ResourceMatchingStrategy {
@Argument(
fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME,
doc = "The strategy to use for determining whether an input variant is present in a resource " +
- "in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " +
+ "in non-allele-specific mode. " +
"START_POSITION: Start positions of input and resource variants must match. " +
"START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(in their given representations) must also be non-empty. " +
"START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(after converting alleles to their minimal representations) must also be non-empty. " +
- "This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " +
+ "This argument has no effect in allele-specific mode, " +
"in which the minimal representations of the input and resource alleles must match.",
optional = true)
private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION;
@@ -186,6 +181,7 @@ enum ResourceMatchingStrategy {
private final Set ignoreInputFilterSet = new TreeSet<>();
Set variantTypesToExtract;
TreeSet resourceLabels = new TreeSet<>();
+ boolean useASAnnotations;
File outputAnnotationsFile;
VariantContextWriter vcfWriter;
@@ -222,9 +218,11 @@ public void onTraversalStart() {
LabeledVariantAnnotationsData.SNP_LABEL));
}
+ useASAnnotations = isAlleleSpecificAnnotationRequested();
+
if (useASAnnotations && resourceMatchingStrategy != ResourceMatchingStrategy.START_POSITION_AND_MINIMAL_REPRESENTATION) {
- logger.warn(String.format("The %s argument is ignored when %s is set to true. The START_POSITION_AND_MINIMAL_REPRESENTATION strategy will be used.",
- RESOURCE_MATCHING_STRATEGY_LONG_NAME, USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME));
+ logger.warn(String.format("The %s argument is ignored when allele-specific annotations are requested. The START_POSITION_AND_MINIMAL_REPRESENTATION strategy will be used.",
+ RESOURCE_MATCHING_STRATEGY_LONG_NAME));
resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION_AND_MINIMAL_REPRESENTATION;
}
@@ -251,6 +249,12 @@ public Object onTraversalSuccess() {
return null;
}
+ /**
+ * Determines whether allele-specific mode applies by inspecting the INFO header lines of the input VCF
+ * for the requested annotations.
+ *
+ * @return true if at least one requested annotation is declared with "Number=A" in the input header;
+ * requested annotations with no INFO header line are skipped rather than causing a NullPointerException
+ */
+ private boolean isAlleleSpecificAnnotationRequested() {
+ final Set<String> distinctAnnotationNames = new LinkedHashSet<>(annotationNames);
+ final VCFHeader inputHeader = getHeaderForVariants();
+ return distinctAnnotationNames.stream()
+ .map(inputHeader::getInfoHeaderLine)
+ // the header lookup yields null for an annotation ID that is not declared in the input header
+ .anyMatch(infoLine -> infoLine != null && infoLine.getCountType() == VCFHeaderLineCount.A);
+ }
+
static void addExtractedVariantToData(final LabeledVariantAnnotationsData data,
final VariantContext variant,
final List, VariantType, TreeSet>> metadata) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java
index 522ac434403..94a69f89881 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java
@@ -25,7 +25,7 @@
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer;
-import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer;
+import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonVariantAnnotationsScorer;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer;
@@ -52,7 +52,7 @@
* Scores variant calls in a VCF file based on site-level annotations using a previously trained model.
*
*
- * This tool is intended to be used as the last step in a variant-filtering workflow that supersedes the
+ * This tool is primarily intended to be used as the last step in a variant-filtering workflow that supersedes the
* {@link VariantRecalibrator} workflow. Using a previously trained model produced by {@link TrainVariantAnnotationsModel},
* this tool assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact).
* Each score can also be converted to a corresponding sensitivity with respect to a calibration set, if the latter is available.
@@ -64,7 +64,7 @@
* Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files
* upon completion of the traversal. Memory and disk requirements thus roughly scale linearly with both the number
* of sites scored and the number of annotations. For large callsets, this tool may be run in parallel over separate
- * genomic shards using the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument as usual.
+ * genomic shards using the "--intervals/-L" argument as usual.
*
*
*
@@ -78,7 +78,7 @@
*
* -
* Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles,
- * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified).
+ * if at least one allele-specific annotation with "Number=A" is specified).
*
* -
* Annotations to use for scoring. These should be identical to those used in the {@link ExtractVariantAnnotations}
@@ -97,7 +97,7 @@
* (Optional) Model backend. This should be identical to that specified in {@link TrainVariantAnnotationsModel}.
* The default Python IsolationForest implementation requires either the GATK Python environment
* or that certain Python packages (argparse, h5py, numpy, sklearn, and dill) are otherwise available.
- * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument.
+ * A custom backend can also be specified in conjunction with the "--python-script" argument.
*
* -
* (Optional) Resource VCF file(s). See the corresponding documentation in {@link ExtractVariantAnnotations}.
@@ -121,34 +121,33 @@
*
*
* -
- * Scored VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME}
+ * Scored VCF file and index. The VCF will not be gzipped if the "--do-not-gzip-vcf-output"
* argument is set to true. The INFO field in each VCF record will be annotated with:
*
*
- * 1) a score (with a key as given by the {@value SCORE_KEY_LONG_NAME} argument,
- * which has a default value of {@value DEFAULT_SCORE_KEY}),
+ * 1) a score (with a key as given by the "--score-key" argument, which has a default value of "SCORE"),
*
*
* 2) if resources are provided, flags corresponding to the labels (e.g.,
- * {@value LabeledVariantAnnotationsData#TRAINING_LABEL}, {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL}, etc.)
+ * "training", "calibration", etc.)
* of resources containing the record,
*
*
- * 3) if the {@value SNP_KEY_LONG_NAME} argument (which has a default value of {@value DEFAULT_SNP_KEY})
+ * 3) if the "--snp-key" argument (which has a default value of "snp")
* is non-null, a flag corresponding to whether a site is treated as a SNP,
*
*
- * 4) if {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and/or
- * {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} are provided, a filter (with name given by
- * the {@value LOW_SCORE_FILTER_NAME_LONG_NAME} argument, which has a default value of
- * {@value DEFAULT_LOW_SCORE_FILTER_NAME}) will be applied if a record has a calibration-set sensitivity
+ * 4) if "--snp-calibration-sensitivity-threshold" and/or
+ * "--indel-calibration-sensitivity-threshold" are provided, a filter (with name given by
+ * the "--low-score-filter-name" argument, which has a default value of
+ * "LOW_SCORE") will be applied if a record has a calibration-set sensitivity
* falling above the appropriate threshold (i.e., if it has a score falling below the corresponding
* score threshold).
*
*
- * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is true, the score, SNP flag, calibration sensitivity,
- * and filter appropriate for the highest scoring allele are used; however, the resource labels for all alleles
- * are applied.
+ * In allele-specific mode (i.e., when allele-specific annotations are requested), the score, SNP flag,
+ * calibration sensitivity, and filter appropriate for the highest scoring allele are used for any
+ * multiallelic records; however, the resource labels for all alleles are applied.
*
*
*
@@ -162,7 +161,7 @@
*
* -
* (Optional) Scores HDF5 file (.scores.hdf5). Scores for all scored sites are stored in the
- * HDF5 path {@value VariantAnnotationsScorer#SCORES_PATH}. Scores are given in the same order as records
+ * HDF5 path "/data/scores". Scores are given in the same order as records
* in both the VCF and the annotations HDF5 file. This file will only be produced if the number of scored sites
* is nonzero.
*
@@ -174,12 +173,12 @@
*
* Score sites using a model (produced by {@link TrainVariantAnnotationsModel} using the default
* {@link VariantAnnotationsModelBackend#PYTHON_IFOREST} model backend and contained in the directory
- * {@code model_dir}), producing the outputs 1) {@code output.vcf.gz}, 2) {@code output.vcf.gz.tbi},
- * 3) {@code output.annot.hdf5}, and 4) {@code output.scores.hdf5}. Note that {@code extract.vcf.gz} is
+ * model_dir), producing the outputs 1) output.vcf.gz, 2) output.vcf.gz.tbi,
+ * 3) output.annot.hdf5, and 4) output.scores.hdf5. Note that extract.vcf.gz is
* produced by {@link ExtractVariantAnnotations}. Records will be filtered according to the values provided to the
- * {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME}
+ * "--snp-calibration-sensitivity-threshold" and "--indel-calibration-sensitivity-threshold"
* arguments; the values below are only meant to be illustrative and should be set as appropriate for a given analysis.
- * Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are
+ * Note that the "--mode" arguments are made explicit here, although both SNP and INDEL modes are
* selected by default.
*
*
@@ -204,9 +203,9 @@
*
* One may chain together two runs of this tool to score SNPs and INDELs using different models
* (note that SNP and INDEL models have "snp" and "indel" tags in their respective filenames, so these
- * models can still be contained in the same {@code model_dir} directory).
+ * models can still be contained in the same model_dir directory).
* This may have implications for mixed SNP/INDEL sites, especially if filters are applied; see also the
- * {@value IGNORE_ALL_FILTERS_LONG_NAME} and {@value IGNORE_FILTER_LONG_NAME} arguments.
+ * "--ignore-all-filters" and "--ignore-filter" arguments.
*
*
* gatk ScoreVariantAnnotations \
@@ -223,7 +222,7 @@
* -O intermediate-output
*
* gatk ScoreVariantAnnotations \
- * -V intermediate-output.vcf \
+ * -V intermediate-output.vcf.gz \
* -A indel_annotation_1 \
* ...
* -A indel_annotation_M \
@@ -236,16 +235,23 @@
* -O output
*
*
+ *
+ * Note that separate SNP and INDEL resources are shown in the above examples purely for demonstration purposes,
+ * as are separate training and calibration resources. However, it may be desirable to specify combined
+ * resource(s); e.g., "--resource:combined-resource,training=true,calibration=true combined-resource.vcf".
+ * Recall that this is also the case in {@link ExtractVariantAnnotations}.
+ *
+ *
* Custom modeling/scoring backends (ADVANCED)
*
*
* The primary scoring functionality performed by this tool is accomplished by a "scoring backend"
* whose fundamental contract is to take an input annotation matrix and to output corresponding scores,
* with both input and output given as HDF5 files. Rather than using one of the available, implemented backends,
- * advanced users may provide their own backend via the {@value PYTHON_SCRIPT_LONG_NAME} argument.
+ * advanced users may provide their own backend via the "--python-script" argument.
* See documentation in the modeling and scoring interfaces ({@link VariantAnnotationsModel} and
* {@link VariantAnnotationsScorer}, respectively), as well as the default Python IsolationForest implementation at
- * {@link PythonSklearnVariantAnnotationsScorer} and
+ * {@link PythonVariantAnnotationsScorer} and
* src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py.
*
*
@@ -485,15 +491,9 @@ protected void afterNthPass(final int n) {
private VariantAnnotationsScorer deserializeScorerFromPklFiles(final VariantType variantType) {
final String variantTypeTag = '.' + variantType.toString().toLowerCase();
final File scorerPklFile = new File(
- modelPrefix + variantTypeTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX);
- final File negativeScorerPklFile = new File(
- modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX);
+ modelPrefix + variantTypeTag + PythonVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX);
return scorerPklFile.canRead()
- ? negativeScorerPklFile.canRead()
- ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer(
- new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile),
- new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, negativeScorerPklFile))
- : new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile)
+ ? new PythonVariantAnnotationsScorer(pythonScriptFile, scorerPklFile)
: null;
}
@@ -501,14 +501,8 @@ private VariantAnnotationsScorer deserializeScorerFromSerFiles(final VariantType
final String variantTypeTag = '.' + variantType.toString().toLowerCase();
final File scorerSerFile = new File(
modelPrefix + variantTypeTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX);
- final File negativeScorerSerFile = new File(
- modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX);
return scorerSerFile.canRead()
- ? negativeScorerSerFile.canRead()
- ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer(
- BGMMVariantAnnotationsScorer.deserialize(scorerSerFile),
- BGMMVariantAnnotationsScorer.deserialize(negativeScorerSerFile))
- : BGMMVariantAnnotationsScorer.deserialize(scorerSerFile)
+ ? BGMMVariantAnnotationsScorer.deserialize(scorerSerFile)
: null;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java
index 6baf81c74eb..570cba33652 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java
@@ -1,9 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable;
import com.google.common.collect.Streams;
-import com.google.common.primitives.Doubles;
import org.apache.commons.math3.stat.descriptive.moment.Variance;
-import org.apache.commons.math3.stat.descriptive.rank.Percentile;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
@@ -17,8 +15,8 @@
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsModel;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer;
-import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsModel;
-import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer;
+import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonVariantAnnotationsModel;
+import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonVariantAnnotationsScorer;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer;
@@ -31,7 +29,6 @@
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -40,10 +37,10 @@
* Trains a model for scoring variant calls based on site-level annotations.
*
*
- * This tool is intended to be used as the second step in a variant-filtering workflow that supersedes the
+ * This tool is primarily intended to be used as the second step in a variant-filtering workflow that supersedes the
* {@link VariantRecalibrator} workflow. Given training (and optionally, calibration) sets of site-level annotations
* produced by {@link ExtractVariantAnnotations}, this tool can be used to train a model for scoring variant
- * calls. For each variant type (i.e., SNP or INDEL) specified using the {@value MODE_LONG_NAME} argument, the tool
+ * calls. For each variant type (i.e., SNP or INDEL) specified using the "--mode" argument, the tool
* outputs files that are either: 1) serialized scorers, each of which persists to disk a function for computing
* scores given subsequent annotations, or 2) HDF5 files containing a set of scores, each corresponding to training,
* calibration, and unlabeled sets, as appropriate.
@@ -59,29 +56,21 @@
*
Modeling approaches
*
*
- * This tool can perform modeling using either a positive-only approach or a positive-negative approach.
+ * This tool can perform modeling using either a positive-only approach or a positive-unlabeled approach.
* In a positive-only approach, the annotation-space distribution of training sites is used to learn a
* function for converting annotations for subsequent sites into a score; typically, higher scores correspond to
- * regions of annotation space that are more densely populated by training sites. In contrast, a positive-negative
- * approach attempts to additionally use unlabeled sites to better identify regions of annotation space that correspond
- * to low scores against the original, positive-only model (with the assumption being that unlabeled sites are
- * more likely to populate such regions than are training sites). A second, negative model can then be trained,
- * and the resulting scores (which are presumably higher in regions of annotation space that are less densely
- * populated by the original training sites) can be subtracted from the original scores to produce a final score.
- * (Note that this positive-negative approach could be considered as a single iteration of a more general
- * approach typically referred to as positive-unlabeled learning.)
+ * regions of annotation space that are more densely populated by training sites. In contrast, a positive-unlabeled
+ * approach attempts to additionally use unlabeled sites to better learn not only these regions of annotation space
+ * populated by training sites, but also those that are populated by sites that may be drawn from a different distribution.
*
*
*
* A positive-only approach is likely to perform well in cases where a sufficient number of reliable training sites
* is available. In contrast, if 1) only a small number of reliable training sites is available, and/or
* 2) the reliability of the training sites is questionable (e.g., the sites may be contaminated by
- * a non-negigible number of sequencing artifacts), then a positive-negative approach may be beneficial.
- * However, note that the positive-negative approach introduces an additional hyperparameter---the threshold
- * that determines the selection of sites for training the negative model, controlled by the
- * {@value CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} argument---which may require tuning.
+ * a non-negligible number of sequencing artifacts), then a positive-unlabeled approach may be beneficial.
* Further note that although {@link VariantRecalibrator} (which this tool supplants) has typically been used to
- * implement a positive-negative approach, a positive-only approach likely suffices in many use cases.
+ * implement a naive positive-unlabeled approach, a positive-only approach likely suffices in many use cases.
*
*
*
@@ -94,24 +83,9 @@
* generated using the positive model and output to a file.
*
*
- * Additionally, if a positive-negative approach has been specified (i.e., the {@value UNLABELED_ANNOTATIONS_HDF5_LONG_NAME}
- * and {@value CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} arguments have been provided),
- * and if both unlabeled and calibration sites of the variant type are available, then:
- *
- *
- * - 4) The calibration scores generated from the positive model are used to convert the
- * calibration-sensitivity threshold into a score threshold,
- * - 5) Training sites with scores below the score threshold are selected for training a negative model,
- * - 6) Scores for unlabeled sites are generated using the positive model and output to a file,
- * - 7) Unlabeled sites with scores below the score threshold are selected for training a negative model,
- * - 8) A negative model is trained using these selected training and unlabeled sites and is serialized to file,
- * - 9) Scores for calibration sites are generated using the positive-negative model and overwritten in the existing file.
- *
- *
- * Note that the positive-negative approach thus yields 1) scores for training and unlabeled sites generated from
- * the positive model and 2) scores for calibration sites generated from the positive-negative model. This is opposed
- * to generating scores from all sites from the positive-negative model, since these can simply be obtained from
- * a downstream run of {@link ScoreVariantAnnotations}.
+ * In contrast, a positive-unlabeled approach may instead be specified by providing the
+ * "--unlabeled-annotations-hdf5" argument. Currently, this requires the use of a custom modeling backend;
+ * see below.
*
*
* Modeling backends
@@ -131,14 +105,14 @@
*
*
*
- * This backend can be selected by specifying {@code PYTHON_IFOREST} to the {@value MODEL_BACKEND_LONG_NAME} argument
+ * This backend can be selected by specifying "--model-backend PYTHON_IFOREST"
- * and is also currently the the default backend. It is implemented by the script at
+ * and is also currently the default backend. It is implemented by the script at
* src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py, which
* requires that the argparse, h5py, numpy, sklearn, and dill packages be present in the Python environment; users
* may wish to simply use the provided GATK conda environment to ensure that the correct versions of all packages are available.
* See the IsolationForest documentation here
* as appropriate for the version of scikit-learn used in your Python environment. The hyperparameters documented
- * there can be specified using the {@value HYPERPARAMETERS_JSON_LONG_NAME} argument; see
+ * there can be specified using the "--hyperparameters-json" argument; see
* src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json
* for an example and the default values.
*
@@ -166,15 +140,13 @@
*
* Labeled-annotations HDF5 file (.annot.hdf5). Annotation data and metadata for labeled sites are stored in the
* HDF5 directory structure given in the documentation for the {@link ExtractVariantAnnotations} tool. In typical
- * usage, both the {@value LabeledVariantAnnotationsData#TRAINING_LABEL} and
- * {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels would be available for non-empty sets of
+ * usage, both the "training" and "calibration" labels would be available for non-empty sets of
* sites of the requested variant type.
*
*
* (Optional) Unlabeled-annotations HDF5 file (.unlabeled.annot.hdf5). Annotation data and metadata for
* unlabeled sites are stored in the HDF5 directory structure given in the documentation for the
- * {@link ExtractVariantAnnotations} tool. If provided, a positive-negative modeling approach (similar to
- * that used in {@link VariantRecalibrator} will be used.
+ * {@link ExtractVariantAnnotations} tool. If provided, a positive-unlabeled modeling approach will be used.
*
*
* Variant types (i.e., SNP and/or INDEL) for which to train models. Logic for determining variant type was retained from
@@ -185,7 +157,7 @@
*
*
* (Optional) Model backend. The Python isolation-forest backend is currently the default backend.
- * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument.
+ * A custom backend can also be specified in conjunction with the "--python-script" argument.
*
*
* (Optional) Model hyperparameters JSON file. This file can be used to specify backend-specific
@@ -205,10 +177,10 @@
* Outputs
*
*
- * The following outputs are produced for each variant type specified by the {@value MODE_LONG_NAME} argument
+ * The following outputs are produced for each variant type specified by the "--mode" argument
* and are delineated by type-specific tags in the filename of each output, which take the form of
- * {@code {output-prefix}.{variant-type}.{file-suffix}}. For example, scores for the SNP calibration set
- * will be output to the {@code {output-prefix}.snp.calibrationScores.hdf5} file.
+ * {output-prefix}.{variant-type}.{file-suffix}. For example, scores for the SNP calibration set
+ * will be output to the {output-prefix}.snp.calibrationScores.hdf5 file.
*
*
*
@@ -216,20 +188,11 @@
* Training-set positive-model scores HDF5 file (.trainingScores.hdf5).
*
*
- * Positive-model serialized scorer file. (.scorer.pkl for the default {@code PYTHON_IFOREST} model backend).
- *
- *
- * (Optional) Unlabeled-set positive-model scores HDF5 file (.unlabeledScores.hdf5). This is only output
- * if a positive-negative modeling approach is used.
+ * Positive-model serialized scorer file (.scorer.pkl for the default PYTHON_IFOREST model backend).
*
*
* (Optional) Calibration-set scores HDF5 file (.calibrationScores.hdf5). This is only output if a calibration
- * set is provided. If a positive-only modeling approach is used, scores will be generated from the positive model;
- * if a positive-negative modeling approach is used, scores will be generated from the positive-negative model.
- *
- *
- * (Optional) Negative-model serialized scorer file. (.negative.scorer.pkl for the default {@code PYTHON_IFOREST} model backend).
- * This is only output if a positive-negative modeling approach is used.
+ * set is provided.
*
*
*
@@ -240,7 +203,7 @@
* given an input labeled-annotations HDF5 file generated by {@link ExtractVariantAnnotations} that contains
* labels for both training and calibration sets, producing the outputs 1) train.snp.scorer.pkl,
* 2) train.snp.trainingScores.hdf5, and 3) train.snp.calibrationScores.hdf5, as well as analogous files
- * for the INDEL model. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both
+ * for the INDEL model. Note that the "--mode" arguments are made explicit here, although both
* SNP and INDEL modes are selected by default.
*
*
@@ -252,37 +215,17 @@
*
*
*
- *
- * Train SNP and INDEL models using the default Python IsolationForest model backend with a positive-negative approach
- * (using a calibration-sensitivity threshold of 0.95 to select sites for training the negative model),
- * given an input labeled-annotations HDF5 file that contains labels for both training and calibration sets
- * and an input unlabeled-annotations HDF5 file (with both HDF5 files generated by {@link ExtractVariantAnnotations}),
- * producing the outputs 1) train.snp.scorer.pkl, 2) train.snp.negative.scorer.pkl, 3) train.snp.trainingScores.hdf5,
- * 4) train.snp.calibrationScores.hdf5, and 5) train.snp.unlabeledScores.hdf5, as well as analogous files
- * for the INDEL model. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both
- * SNP and INDEL modes are selected by default.
- *
- *
- * gatk TrainVariantAnnotationsModel \
- * --annotations-hdf5 extract.annot.hdf5 \
- * --unlabeled-annotations-hdf5 extract.unlabeled.annot.hdf5 \
- * --mode SNP \
- * --mode INDEL \
- * --calibration-sensitivity-threshold 0.95 \
- * -O train
- *
- *
- *
* Custom modeling/scoring backends (ADVANCED)
*
*
* The primary modeling functionality performed by this tool is accomplished by a "modeling backend"
* whose fundamental contract is to take an input HDF5 file containing an annotation matrix for sites of a
- * single variant type (i.e., SNP or INDEL) and to output a serialized scorer for that variant type.
+ * single variant type (i.e., SNP or INDEL), as well as an analogous HDF5 file for unlabeled sites
+ * if a positive-unlabeled modeling approach has been specified, and to output a serialized scorer for that variant type.
* Rather than using one of the available, implemented backends, advanced users may provide their own backend
- * via the {@value PYTHON_SCRIPT_LONG_NAME} argument. See documentation in the modeling and scoring interfaces
+ * via the "--python-script" argument. See documentation in the modeling and scoring interfaces
* ({@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}, respectively), as well as the default
- * Python IsolationForest implementation at {@link PythonSklearnVariantAnnotationsModel} and
+ * Python IsolationForest implementation at {@link PythonVariantAnnotationsModel} and
* src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py.
*
*
@@ -311,7 +254,6 @@ public final class TrainVariantAnnotationsModel extends CommandLineProgram {
public static final String MODEL_BACKEND_LONG_NAME = "model-backend";
public static final String PYTHON_SCRIPT_LONG_NAME = "python-script";
public static final String HYPERPARAMETERS_JSON_LONG_NAME = "hyperparameters-json";
- public static final String CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "calibration-sensitivity-threshold";
public static final String ISOLATION_FOREST_PYTHON_SCRIPT = "isolation-forest.py";
public static final String ISOLATION_FOREST_HYPERPARAMETERS_JSON = "isolation-forest-hyperparameters.json";
@@ -323,7 +265,6 @@ enum AvailableLabelsMode {
public static final String TRAINING_SCORES_HDF5_SUFFIX = ".trainingScores.hdf5";
public static final String CALIBRATION_SCORES_HDF5_SUFFIX = ".calibrationScores.hdf5";
public static final String UNLABELED_SCORES_HDF5_SUFFIX = ".unlabeledScores.hdf5";
- public static final String NEGATIVE_TAG = ".negative";
@Argument(
fullName = ANNOTATIONS_HDF5_LONG_NAME,
@@ -333,8 +274,7 @@ enum AvailableLabelsMode {
@Argument(
fullName = UNLABELED_ANNOTATIONS_HDF5_LONG_NAME,
doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations. " +
- "If specified with " + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME + ", " +
- "a positive-unlabeled modeling approach will be used; otherwise, a positive-only modeling " +
+ "If specified, a positive-unlabeled modeling approach will be used; otherwise, a positive-only modeling " +
"approach will be used.",
optional = true)
private File inputUnlabeledAnnotationsFile;
@@ -368,20 +308,6 @@ enum AvailableLabelsMode {
doc = "Output prefix.")
private String outputPrefix;
- @Argument(
- fullName = CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME,
- doc = "Calibration-sensitivity threshold that determines which sites will be used for training the negative model " +
- "in the positive-unlabeled modeling approach. " +
- "Increasing this will decrease the corresponding positive-model score threshold; sites with scores below this score " +
- "threshold will be used for training the negative model. Thus, this parameter should typically be chosen to " +
- "be close to 1, so that sites that score highly according to the positive model will not be used to train the negative model. " +
- "The " + UNLABELED_ANNOTATIONS_HDF5_LONG_NAME + " argument must be specified in conjunction with this argument. " +
- "If separate thresholds for SNP and INDEL models are desired, run the tool separately for each mode with its respective threshold.",
- optional = true,
- minValue = 0.,
- maxValue = 1.)
- private Double calibrationSensitivityThreshold;
-
@Argument(
fullName = MODE_LONG_NAME,
doc = "Variant types for which to train models. Duplicate values will be ignored.",
@@ -411,11 +337,7 @@ protected Object doWork() {
private void validateArgumentsAndSetModes() {
IOUtils.canReadFile(inputAnnotationsFile);
- Utils.validateArg((inputUnlabeledAnnotationsFile == null) == (calibrationSensitivityThreshold == null),
- "Unlabeled annotations and calibration-sensitivity threshold must both be unspecified (for positive-only model training) " +
- "or specified (for positive-negative model training).");
-
- availableLabelsMode = inputUnlabeledAnnotationsFile != null && calibrationSensitivityThreshold != null
+ availableLabelsMode = inputUnlabeledAnnotationsFile != null
? AvailableLabelsMode.POSITIVE_UNLABELED
: AvailableLabelsMode.POSITIVE_ONLY;
@@ -485,7 +407,29 @@ private void doModelingWorkForVariantType(final VariantType variantType) {
logger.info(String.format("Training %s model with %d training sites x %d annotations %s...",
variantTypeString, numTrainingAndVariantType, annotationNames.size(), annotationNames));
final File labeledTrainingAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isTrainingAndVariantType);
- trainAndSerializeModel(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag);
+
+ File unlabeledAndVariantTypeAnnotationsFile = null;
+ int numUnlabeledAndVariantType = 0;
+ if (availableLabelsMode == AvailableLabelsMode.POSITIVE_UNLABELED) {
+ final double[][] unlabeledAnnotations = LabeledVariantAnnotationsData.readAnnotations(inputUnlabeledAnnotationsFile);
+ final List<Boolean> unlabeledIsSNP = LabeledVariantAnnotationsData.readLabel(inputUnlabeledAnnotationsFile, "snp");
+ final List<Boolean> isUnlabeledAndVariantType = variantType == VariantType.SNP ? unlabeledIsSNP : unlabeledIsSNP.stream().map(x -> !x).collect(Collectors.toList());
+
+ numUnlabeledAndVariantType = numPassingFilter(isUnlabeledAndVariantType);
+
+ if (numUnlabeledAndVariantType > 0) {
+ logger.info(String.format("Training %s model with %d unlabeled sites x %d annotations %s...",
+ variantTypeString, numUnlabeledAndVariantType, annotationNames.size(), annotationNames));
+ unlabeledAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(
+ annotationNames, unlabeledAnnotations, isUnlabeledAndVariantType);
+ } else {
+ throw new UserException.BadInput(String.format("Attempted to train %s model, " +
+ "but no suitable unlabeled sites were found in the provided annotations.", variantTypeString));
+ }
+ }
+
+ trainAndSerializeModel(labeledTrainingAndVariantTypeAnnotationsFile, unlabeledAndVariantTypeAnnotationsFile, outputPrefixTag);
+
logger.info(String.format("%s model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag));
if (modelBackend == VariantAnnotationsModelBackend.JAVA_BGMM) {
@@ -508,64 +452,10 @@ private void doModelingWorkForVariantType(final VariantType variantType) {
logger.warn(String.format("No %s calibration sites were available.", variantTypeString));
}
- // negative model
- if (availableLabelsMode == AvailableLabelsMode.POSITIVE_UNLABELED) {
- if (numLabeledCalibrationAndVariantType == 0) {
- throw new UserException.BadInput(String.format("Attempted to train %s negative model, " +
- "but no suitable calibration sites were found in the provided annotations.", variantTypeString));
- }
- final double[][] unlabeledAnnotations = LabeledVariantAnnotationsData.readAnnotations(inputUnlabeledAnnotationsFile);
- final List unlabeledIsSNP = LabeledVariantAnnotationsData.readLabel(inputUnlabeledAnnotationsFile, "snp");
- final List isUnlabeledVariantType = variantType == VariantType.SNP ? unlabeledIsSNP : unlabeledIsSNP.stream().map(x -> !x).collect(Collectors.toList());
-
- final int numUnlabeledVariantType = numPassingFilter(isUnlabeledVariantType);
-
- if (numUnlabeledVariantType > 0) {
- final File labeledCalibrationAndVariantTypeScoresFile = new File(outputPrefix + outputPrefixTag + CALIBRATION_SCORES_HDF5_SUFFIX);
- final double[] labeledCalibrationAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledCalibrationAndVariantTypeScoresFile);
- final double scoreThreshold = calibrationSensitivityThreshold == 1. // Percentile requires quantile > 0, so we treat this as a special case
- ? Doubles.min(labeledCalibrationAndVariantTypeScores)
- : new Percentile(100. * (1. - calibrationSensitivityThreshold)).evaluate(labeledCalibrationAndVariantTypeScores);
- logger.info(String.format("Using %s score threshold of %.4f corresponding to specified calibration-sensitivity threshold of %.4f ...",
- variantTypeString, scoreThreshold, calibrationSensitivityThreshold));
-
- final double[] labeledTrainingAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledTrainingAndVariantTypeScoresFile);
- final List isNegativeTrainingFromLabeledTrainingAndVariantType = Arrays.stream(labeledTrainingAndVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList());
-
- logger.info(String.format("Scoring %d unlabeled %s sites...", numUnlabeledVariantType, variantTypeString));
- final File unlabeledVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isUnlabeledVariantType);
- final File unlabeledVariantTypeScoresFile = score(unlabeledVariantTypeAnnotationsFile, outputPrefixTag, UNLABELED_SCORES_HDF5_SUFFIX);
- final double[] unlabeledVariantTypeScores = VariantAnnotationsScorer.readScores(unlabeledVariantTypeScoresFile);
- final List isNegativeTrainingFromUnlabeledVariantType = Arrays.stream(unlabeledVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); // length matches unlabeledAnnotationsFile
- final int numNegativeTrainingFromUnlabeledVariantType = numPassingFilter(isNegativeTrainingFromUnlabeledVariantType);
- logger.info(String.format("Selected %d unlabeled %s sites below score threshold of %.4f for negative-model training...",
- numNegativeTrainingFromUnlabeledVariantType, variantTypeString, scoreThreshold));
-
- final double[][] negativeTrainingAndVariantTypeAnnotations = concatenateLabeledAndUnlabeledNegativeTrainingData(
- annotationNames, annotations, unlabeledAnnotations, isNegativeTrainingFromLabeledTrainingAndVariantType, isNegativeTrainingFromUnlabeledVariantType);
- final int numNegativeTrainingAndVariantType = negativeTrainingAndVariantTypeAnnotations.length;
- final List isNegativeTrainingAndVariantType = Collections.nCopies(numNegativeTrainingAndVariantType, true);
-
- if (numNegativeTrainingAndVariantType > 0) {
- logger.info(String.format("Training %s negative model with %d negative-training sites x %d annotations %s...",
- variantTypeString, numNegativeTrainingAndVariantType, annotationNames.size(), annotationNames));
- final File negativeTrainingAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(
- annotationNames, negativeTrainingAndVariantTypeAnnotations, isNegativeTrainingAndVariantType);
- trainAndSerializeModel(negativeTrainingAnnotationsFile, outputPrefixTag + NEGATIVE_TAG);
- logger.info(String.format("%s negative model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag + NEGATIVE_TAG));
- } else {
- throw new UserException.BadInput(String.format("Attempted to train %s negative model, " +
- "but no suitable sites with scores below the specified threshold were found in the provided annotations.", variantTypeString));
- }
-
- logger.info(String.format("Re-scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString));
- final File labeledCalibrationAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isLabeledCalibrationAndVariantType);
- final File labeledCalibrationScoresFile = positiveNegativeScore(labeledCalibrationAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX);
- logger.info(String.format("Calibration scores written to %s.", labeledCalibrationScoresFile.getAbsolutePath()));
- } else {
- throw new UserException.BadInput(String.format("Attempted to train %s negative model, " +
- "but no suitable unlabeled sites were found in the provided annotations.", variantTypeString));
- }
+ if (availableLabelsMode == AvailableLabelsMode.POSITIVE_UNLABELED && unlabeledAndVariantTypeAnnotationsFile != null) {
+ logger.info(String.format("Scoring %d %s unlabeled sites...", numUnlabeledAndVariantType, variantTypeString));
+ final File unlabeledAndVariantTypeScoresFile = score(unlabeledAndVariantTypeAnnotationsFile, outputPrefixTag, UNLABELED_SCORES_HDF5_SUFFIX);
+ logger.info(String.format("%s unlabeled scores written to %s.", variantTypeString, unlabeledAndVariantTypeScoresFile.getAbsolutePath()));
}
} else {
throw new UserException.BadInput(String.format("Attempted to train %s model, " +
@@ -577,34 +467,41 @@ private static int numPassingFilter(final List isPassing) {
return (int) isPassing.stream().filter(x -> x).count();
}
+ /**
+ * @param unlabeledAnnotationsFile if not {@code null}, use a positive-unlabeled approach
+ */
private void trainAndSerializeModel(final File trainingAnnotationsFile,
+ final File unlabeledAnnotationsFile,
final String outputPrefixTag) {
- readAndValidateTrainingAnnotations(trainingAnnotationsFile, outputPrefixTag);
+ readAndValidateAnnotations(trainingAnnotationsFile, outputPrefixTag);
+ if (unlabeledAnnotationsFile != null) {
+ readAndValidateAnnotations(unlabeledAnnotationsFile, outputPrefixTag);
+ }
final VariantAnnotationsModel model;
switch (modelBackend) {
case JAVA_BGMM:
model = new BGMMVariantAnnotationsModel(hyperparametersJSONFile);
break;
case PYTHON_IFOREST:
- model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile);
+ model = new PythonVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile);
break;
case PYTHON_SCRIPT:
- model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile);
+ model = new PythonVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile);
break;
default:
throw new GATKException.ShouldNeverReachHereException("Unknown model mode.");
}
- model.trainAndSerialize(trainingAnnotationsFile, outputPrefix + outputPrefixTag);
+ model.trainAndSerialize(trainingAnnotationsFile, unlabeledAnnotationsFile, outputPrefix + outputPrefixTag);
}
/**
* When training models on data that has been subset to a given variant type,
* we FAIL if any annotation is completely missing and WARN if any annotation has zero variance.
*/
- private void readAndValidateTrainingAnnotations(final File trainingAnnotationsFile,
- final String outputPrefixTag) {
- final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(trainingAnnotationsFile);
- final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(trainingAnnotationsFile);
+ private void readAndValidateAnnotations(final File annotationsFile,
+ final String outputPrefixTag) {
+ final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(annotationsFile);
+ final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(annotationsFile);
// these checks are redundant, but we err on the side of robustness
final int numAnnotationNames = annotationNames.size();
@@ -631,14 +528,8 @@ private void readAndValidateTrainingAnnotations(final File trainingAnnotationsFi
if (!completelyMissingAnnotationNames.isEmpty()) {
throw new UserException.BadInput(
String.format("All values of the following annotations are missing in the training data for the %s model: %s. " +
- "Consider repeating the extraction step with this annotation dropped. " +
- "If this is a negative model and the amount of negative training data is small, " +
- "perhaps also consider lowering the value of the %s argument so that more " +
- "training data is considered, which may ultimately admit data with non-missing values for the annotation " +
- "(although note that this will also have implications for the resulting model fit); " +
- "alternatively, consider excluding the %s and %s arguments and running positive-only modeling.",
- outputPrefix + outputPrefixTag, completelyMissingAnnotationNames,
- CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME));
+ "Consider repeating the extraction step without specifying these annotations. ",
+ outputPrefix + outputPrefixTag, completelyMissingAnnotationNames));
}
}
@@ -652,7 +543,7 @@ private File score(final File annotationsFile,
break;
case PYTHON_IFOREST:
case PYTHON_SCRIPT:
- scorer = new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX));
+ scorer = new PythonVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX));
break;
default:
@@ -662,56 +553,4 @@ private File score(final File annotationsFile,
scorer.score(annotationsFile, outputScoresFile);
return outputScoresFile;
}
-
- private File positiveNegativeScore(final File annotationsFile,
- final String outputPrefixTag,
- final String outputSuffix) {
- final VariantAnnotationsScorer scorer;
- switch (modelBackend) {
- case JAVA_BGMM:
- scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer(
- BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)),
- BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)));
- break;
- case PYTHON_IFOREST:
- case PYTHON_SCRIPT:
- scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer(
- new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)),
- new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)));
- break;
- default:
- throw new GATKException.ShouldNeverReachHereException("Unknown model mode.");
- }
- final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix);
- scorer.score(annotationsFile, outputScoresFile);
- return outputScoresFile;
- }
-
- private static double[][] concatenateLabeledAndUnlabeledNegativeTrainingData(final List annotationNames,
- final double[][] annotations,
- final double[][] unlabeledAnnotations,
- final List isNegativeTrainingFromLabeledTrainingAndVariantType,
- final List isNegativeTrainingFromUnlabeledVariantType) {
- final double[][] negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations;
- if (numPassingFilter(isNegativeTrainingFromLabeledTrainingAndVariantType) > 0) {
- final File negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile =
- LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isNegativeTrainingFromLabeledTrainingAndVariantType);
- negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations = LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile);
- } else {
- negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations = new double[0][];
- }
-
- final double[][] negativeTrainingFromUnlabeledVariantTypeAnnotations;
- if (numPassingFilter(isNegativeTrainingFromUnlabeledVariantType) > 0) {
- final File negativeTrainingFromUnlabeledVariantTypeAnnotationsFile =
- LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isNegativeTrainingFromUnlabeledVariantType);
- negativeTrainingFromUnlabeledVariantTypeAnnotations = LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromUnlabeledVariantTypeAnnotationsFile);
- } else {
- negativeTrainingFromUnlabeledVariantTypeAnnotations = new double[0][];
- }
-
- return Streams.concat(
- Arrays.stream(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations),
- Arrays.stream(negativeTrainingFromUnlabeledVariantTypeAnnotations)).toArray(double[][]::new);
- }
}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java
index 6b960d1042f..665777c1936 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java
@@ -93,8 +93,8 @@ private static double decodeAnnotation(final VariantContext vc,
value = vc.getAttributeAsDouble(annotationName, Double.NaN);
} catch (final ClassCastException e) {
throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " +
- "Ensure that %s is specified, if desired. Encountered exception: %s",
- annotationName, vc, LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, e));
+ "Encountered exception: %s",
+ annotationName, vc, e));
}
}
if (Double.isInfinite(value)) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java
index 14fedaa0a98..9f46d801869 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java
@@ -14,6 +14,7 @@ public BGMMVariantAnnotationsModel(final File hyperparametersJSONFile) {
@Override
public void trainAndSerialize(final File trainingAnnotationsFile,
+ final File unlabeledAnnotationsFile,
final String outputPrefix) {
throw new NotImplementedException("BGMM module will be implemented in separate PR.");
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonVariantAnnotationsModel.java
similarity index 81%
rename from src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java
rename to src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonVariantAnnotationsModel.java
index bbe082186a3..542f7b2cbcc 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonVariantAnnotationsModel.java
@@ -29,25 +29,26 @@
*
* See src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation.
*/
-public final class PythonSklearnVariantAnnotationsModel implements VariantAnnotationsModel {
+public final class PythonVariantAnnotationsModel implements VariantAnnotationsModel {
private final File pythonScriptFile;
private final File hyperparametersJSONFile;
- public PythonSklearnVariantAnnotationsModel(final File pythonScriptFile,
- final File hyperparametersJSONFile) {
+ public PythonVariantAnnotationsModel(final File pythonScriptFile,
+ final File hyperparametersJSONFile) {
this.pythonScriptFile = pythonScriptFile;
this.hyperparametersJSONFile = hyperparametersJSONFile;
}
@Override
public void trainAndSerialize(final File trainingAnnotationsFile,
+ final File unlabeledAnnotationsFile,
final String outputPrefix) {
final PythonScriptExecutor executor = new PythonScriptExecutor(true);
final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput(
pythonScriptFile.getAbsolutePath(),
null,
- composePythonArguments(trainingAnnotationsFile, hyperparametersJSONFile, outputPrefix));
+ composePythonArguments(trainingAnnotationsFile, unlabeledAnnotationsFile, hyperparametersJSONFile, outputPrefix));
if (pythonProcessOutput.getExitValue() != 0) {
throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput));
@@ -55,13 +56,18 @@ public void trainAndSerialize(final File trainingAnnotationsFile,
}
private static List composePythonArguments(final File annotationsFile,
+ final File unlabeledAnnotationsFile,
final File hyperparametersJSONFile,
final String outputPrefix) {
try {
- return new ArrayList<>(Arrays.asList(
+ final List arguments = new ArrayList<>(Arrays.asList(
"--annotations_file=" + annotationsFile.getCanonicalPath(),
"--hyperparameters_json_file=" + hyperparametersJSONFile.getCanonicalPath(),
"--output_prefix=" + outputPrefix));
+ if (unlabeledAnnotationsFile != null) {
+ arguments.add("--unlabeled_annotations_file=" + unlabeledAnnotationsFile.getCanonicalPath());
+ }
+ return arguments;
} catch (final IOException e) {
throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e));
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonVariantAnnotationsScorer.java
similarity index 92%
rename from src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java
rename to src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonVariantAnnotationsScorer.java
index 51e4e9a4e9b..fb5e190305e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonVariantAnnotationsScorer.java
@@ -25,7 +25,7 @@
*
* See src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation.
*/
-public final class PythonSklearnVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable {
+public final class PythonVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable {
private static final long serialVersionUID = 1L;
@@ -34,8 +34,8 @@ public final class PythonSklearnVariantAnnotationsScorer implements VariantAnnot
private final File pythonScriptFile;
private final File scorerPklFile;
- public PythonSklearnVariantAnnotationsScorer(final File pythonScriptFile,
- final File scorerPklFile) {
+ public PythonVariantAnnotationsScorer(final File pythonScriptFile,
+ final File scorerPklFile) {
this.pythonScriptFile = pythonScriptFile;
this.scorerPklFile = scorerPklFile;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java
index ee2e899d0a8..cb3c5595e3f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java
@@ -38,9 +38,10 @@ public interface VariantAnnotationsModel {
* 2) we assume the model does not care about the variant type.
* TODO we could also pass additional labels to be used in training,
* but all backends would have to likewise respect directory structure
- *
+ * @param unlabeledAnnotationsFile Unlabeled annotations in HDF5 format, with the same structure and format as {@code trainingAnnotationsFile}. May be {@code null}.
* @param outputPrefix Path prefix for all output files
*/
void trainAndSerialize(final File trainingAnnotationsFile,
+ final File unlabeledAnnotationsFile,
final String outputPrefix);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java
index c0550273c57..377bacecc5e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java
@@ -91,21 +91,4 @@ static void writeScores(final File outputFile,
exception, outputFile.getAbsolutePath()));
}
}
-
- /**
- * Yields a VQSR-style positive-negative scorer that returns the difference of the positive score and the negative score.
- */
- static VariantAnnotationsScorer combinePositiveAndNegativeScorer(final VariantAnnotationsScorer positiveScorer,
- final VariantAnnotationsScorer negativeScorer) {
- return (inputAnnotationsFile, outputScoresFile) -> {
- final File tempPositiveScoresFile = IOUtils.createTempFile("positive", "scores.hdf5");
- final File tempNegativeScoresFile = IOUtils.createTempFile("negative", "scores.hdf5");
- positiveScorer.score(inputAnnotationsFile, tempPositiveScoresFile);
- final double[] positiveScores = VariantAnnotationsScorer.readScores(tempPositiveScoresFile);
- negativeScorer.score(inputAnnotationsFile, tempNegativeScoresFile);
- final double[] negativeScores = VariantAnnotationsScorer.readScores(tempNegativeScoresFile);
- final double[] scores = IntStream.range(0, positiveScores.length).mapToDouble(i -> positiveScores[i] - negativeScores[i]).toArray();
- VariantAnnotationsScorer.writeScores(outputScoresFile, scores);
- };
- }
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java
index dd3f1202b7f..e3379e91928 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java
@@ -83,7 +83,6 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
return argsBuilder;
};
static final Function ADD_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> {
- argsBuilder.addFlag(LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME);
ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a));
return argsBuilder;
};
@@ -230,16 +229,6 @@ public void testNoVariantsInInput() {
Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists());
}
- @Test(expectedExceptions = UserException.class)
- public void testForgotToSpecifyUseAlleleSpecificAnnotationsFlag() {
- final File outputDir = createTempDir("extract");
- final String outputPrefix = String.format("%s/test", outputDir);
- final ArgumentsBuilder argsBuilder = ADD_SNP_MODE_AND_RESOURCES.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get());
- ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a));
- argsBuilder.addOutput(outputPrefix);
- runCommandLine(argsBuilder);
- }
-
@Test(expectedExceptions = UserException.class)
public void testReservedSNPResourceLabel() {
final File outputDir = createTempDir("extract");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java
index 289821d0e54..1ff89982a9b 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java
@@ -100,8 +100,8 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
public Object[][] dataValidInputs() {
final List>>> testConfigurations = Lists.cartesianProduct(
Arrays.asList(
- Pair.of("extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg", Function.identity()),
- Pair.of("extract.AS.snpIndel.posUn.train.snpIndel.posNeg", Function.identity())),
+ Pair.of("extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly", Function.identity()),
+ Pair.of("extract.AS.snpIndel.posUn.train.snpIndel.posOnly", Function.identity())),
Arrays.asList(
Pair.of("IF.score", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), // this and the following case give the same results, so they are given the same IF.score tag
Pair.of("IF.score", ADD_ISOLATION_FOREST_PYTHON_SCRIPT
@@ -113,7 +113,7 @@ public Object[][] dataValidInputs() {
return testConfigurations.stream()
.map(tagAndAddFunctionPairs -> new Object[]{
- tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp
+ tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp
tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder
.reduce(Function.identity(), Function::andThen) // by stringing together functions that add the
.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments
@@ -121,7 +121,7 @@ public Object[][] dataValidInputs() {
}
/**
- * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp") and arguments corresponding to the
+ * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp") and arguments corresponding to the
* Cartesian products generated in {@link #dataValidInputs}.
*
* We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file.
@@ -136,7 +136,7 @@ public void testValidInputs(final String tag,
argsBuilder.addOutput(outputPrefix);
// add arguments for model prefix based on the
- // train tag (the portion of the tag preceding ".score", e.g., extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF),
+ // train tag (the portion of the tag preceding ".score", e.g., extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF),
// which gives the basename for the model files
final String trainTag = tag.split(".score")[0];
if (tag.contains("nonAS")) {
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java
index 9082fe7a0ad..4f631c77bf2 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java
@@ -9,7 +9,7 @@
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer;
-import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer;
+import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonVariantAnnotationsScorer;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
@@ -69,10 +69,6 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
argsBuilder.add(TrainVariantAnnotationsModel.UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, unlabeledAnnotationsHDF5);
return argsBuilder;
};
- private static final BiFunction ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> {
- argsBuilder.add(TrainVariantAnnotationsModel.CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold);
- return argsBuilder;
- };
private static final Function ADD_SNP_MODE = argsBuilder -> {
argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP);
return argsBuilder;
@@ -98,13 +94,12 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
* Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options:
* 1) non-allele-specific ("nonAS") vs. allele-specific ("AS")
* 2) SNP-only ("snp") vs. SNP+INDEL ("snpIndel") (for both of these options, we use extracted annotations that contain both SNP and INDEL variants as input)
- * 3) positive training with {extract-tag}.annot.hdf5 ("posOnly") vs. positive-negative training with {extract-tag}.annot.hdf5 and {extract-tag}.unlabeled.annot.hdf5 ("posNeg")
- * 4) model backend
- * 4a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub
- * 4b) default PYTHON_IFOREST with default hyperparameters ("IF")
- * 4c) default PYTHON_IFOREST with non-default seed hyperparameter ("IFDifferentSeed")
- * 4d) specified PYTHON_SCRIPT with non-default seed hyperparameter ("IFDifferentSeed"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface
- * We should expect 4c-d to give functionally identical results.
+ * 3) model backend
+ * 3a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub
+ * 3b) default PYTHON_IFOREST with default hyperparameters ("IF")
+ * 3c) default PYTHON_IFOREST with non-default seed hyperparameter ("IFDifferentSeed")
+ * 3d) specified PYTHON_SCRIPT with non-default seed hyperparameter ("IFDifferentSeed"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface
+ * We should expect 3c-d to give functionally identical results.
*/
@DataProvider(name = "dataValidInputs")
public Object[][] dataValidInputs() {
@@ -115,9 +110,8 @@ public Object[][] dataValidInputs() {
Arrays.asList(
Pair.of("snp", ADD_SNP_MODE),
Pair.of("snpIndel", ADD_SNP_MODE.andThen(ADD_INDEL_MODE))),
- Arrays.asList( // we will consume the tag and add appropriate arguments for positive and positive-negative training below
- Pair.of("posOnly", Function.identity()),
- Pair.of("posNeg", Function.identity())),
+ Collections.singletonList(
+ Pair.of("posOnly", Function.identity())),
Arrays.asList(
Pair.of("IF", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)),
Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON
@@ -157,17 +151,7 @@ public void testValidInputs(final String tag,
extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX);
final Function addPositiveAnnotations = ab ->
ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5);
- if (tag.contains("posNeg")) {
- final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR,
- extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX);
- final Function addUnlabeledAnnotations = ab ->
- ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5);
- final Function addCalibrationSensitivityThreshold = ab ->
- ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD);
- addPositiveAnnotations.andThen(addUnlabeledAnnotations).andThen(addCalibrationSensitivityThreshold).apply(argsBuilder);
- } else {
- addPositiveAnnotations.apply(argsBuilder);
- }
+ addPositiveAnnotations.apply(argsBuilder);
runCommandLine(argsBuilder);
@@ -204,18 +188,10 @@ private static void assertExpectedOutputsForVariantType(final String tag,
tagAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX,
outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX));
- assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, false);
+ assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType);
- if (tag.contains("posNeg")) {
- SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s",
- EXPECTED_TEST_FILES_DIR,
- tagAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX,
- outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX));
- assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, true);
- } else {
+ if (tag.contains("posOnly")) {
Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists());
- Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists());
- Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
}
}
@@ -226,9 +202,7 @@ private static void assertOutputsForVariantTypeDoNotExist(final String outputPre
Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX).exists());
Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists());
Assert.assertFalse(new File(outputPrefixAndVariantType + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists());
- Assert.assertFalse(new File(outputPrefixAndVariantType + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
- Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists());
- Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
+ Assert.assertFalse(new File(outputPrefixAndVariantType + PythonVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
}
/**
@@ -237,15 +211,13 @@ private static void assertOutputsForVariantTypeDoNotExist(final String outputPre
* coverage.
*/
private static void assertScorerExpectedOutputs(final String tagAndVariantType,
- final String outputPrefixAndVariantType,
- final boolean isNegative) {
- final String positiveOrNegativeTag = isNegative ? ".negative" : "";
- final String scorerTag = outputPrefixAndVariantType + positiveOrNegativeTag;
+ final String outputPrefixAndVariantType) {
+ final String scorerTag = outputPrefixAndVariantType;
if (tagAndVariantType.contains("BGMM")) {
Assert.assertTrue(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists());
- Assert.assertFalse(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
+ Assert.assertFalse(new File(scorerTag + PythonVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
} else if (tagAndVariantType.contains("IF")) {
- Assert.assertTrue(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
+ Assert.assertTrue(new File(scorerTag + PythonVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists());
Assert.assertFalse(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists());
} else {
Assert.fail("Unknown model-backend tag.");
@@ -288,46 +260,6 @@ public void testSNPOnlyModelsFromSNPOnlyAndSNPPlusIndelAnnotationsAreIdentical()
outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX));
}
- @Test(expectedExceptions = IllegalArgumentException.class)
- public void testUnlabeledAnnotationsSpecifiedWithoutCalibrationSensitivityThreshold() {
- final File outputDir = createTempDir("train");
- final String outputPrefix = String.format("%s/test", outputDir);
- final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get();
- argsBuilder.addOutput(outputPrefix);
- final String extractTag = "extract.nonAS.snpIndel.posUn";
- final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR,
- extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX);
- final Function addPositiveAnnotations = ab ->
- ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5);
- final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR,
- extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX);
- final Function addUnlabeledAnnotations = ab ->
- ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5);
- addPositiveAnnotations
- .andThen(addUnlabeledAnnotations)
- .apply(argsBuilder);
- runCommandLine(argsBuilder);
- }
-
- @Test(expectedExceptions = IllegalArgumentException.class)
- public void testCalibrationSensitivityThresholdSpecifiedWithoutUnlabeledAnnotations() {
- final File outputDir = createTempDir("train");
- final String outputPrefix = String.format("%s/test", outputDir);
- final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get();
- argsBuilder.addOutput(outputPrefix);
- final String extractTag = "extract.nonAS.snpIndel.posUn";
- final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR,
- extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX);
- final Function addPositiveAnnotations = ab ->
- ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5);
- final Function addCalibrationSensitivityThreshold = ab ->
- ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD);
- addPositiveAnnotations
- .andThen(addCalibrationSensitivityThreshold)
- .apply(argsBuilder);
- runCommandLine(argsBuilder);
- }
-
@Test(expectedExceptions = IllegalArgumentException.class) // python environment is required to run tool
public void testPositiveAndUnlabeledAnnotationNamesAreNotIdentical() {
final File outputDir = createTempDir("train");
@@ -342,11 +274,8 @@ public void testPositiveAndUnlabeledAnnotationNamesAreNotIdentical() {
"extract.AS.snpIndel.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // allele-specific
final Function addUnlabeledAnnotations = ab ->
ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5);
- final Function addCalibrationSensitivityThreshold = ab ->
- ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD);
addPositiveAnnotations
.andThen(addUnlabeledAnnotations)
- .andThen(addCalibrationSensitivityThreshold)
.apply(argsBuilder);
runCommandLine(argsBuilder);
}
@@ -368,7 +297,8 @@ public void testPositiveAnnotationsOfSpecifiedVariantTypesNotPresent() {
runCommandLine(argsBuilder);
}
- @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool
+ // Disabled for now; re-enable this test once a positive-unlabeled backend is implemented.
+ @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}, enabled = false) // python environment is required to run tool
public void testUnlabeledAnnotationsOfSpecifiedVariantTypesNotPresent() {
final File outputDir = createTempDir("train");
final String outputPrefix = String.format("%s/test", outputDir);
@@ -382,12 +312,9 @@ public void testUnlabeledAnnotationsOfSpecifiedVariantTypesNotPresent() {
"extract.nonAS.snp.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified
final Function addUnlabeledAnnotations = ab ->
ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5);
- final Function addCalibrationSensitivityThreshold = ab ->
- ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD);
ADD_SNP_MODE.andThen(ADD_INDEL_MODE)
.andThen(addPositiveAnnotations)
.andThen(addUnlabeledAnnotations)
- .andThen(addCalibrationSensitivityThreshold)
.apply(argsBuilder);
runCommandLine(argsBuilder);
}
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5
deleted file mode 100644
index b75fc0d7d7f..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a8d333bfb49c88d34c24250c6e31ae53b00bda9076c1106390ed10f5949de160
-size 736656
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5
deleted file mode 100644
index fb9d835d194..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8801b51a901df9ef298359cba9e6764d76fa773c9c92c5e14727ef77f70d2beb
-size 35136
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf
deleted file mode 100644
index 1ddff59081d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6bfd450c342ce64428a70975bcdb9764e37154ec4463d008203fc63631391b14
-size 2227806
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx
deleted file mode 100644
index d9579cdc497..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:29e2978e195940ad1c236267274871f91d51ac7b12e824aac03d56ffe2f946f1
-size 119222
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5
deleted file mode 100644
index 5089c01571c..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7777d249687060ee948557d6271916c597c87ee9f297e07d8f78a8451e405d86
-size 822288
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5
deleted file mode 100644
index c87bb0f1a42..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb63a07115d848e929102cc23a77c735187126d2abfc3af7813a4fe4a77b612c
-size 38440
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf
deleted file mode 100644
index 6849cdd175a..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7dd1d93b67390c83f9569a641239eebcd01860673b41907ad05d19635fd60ab8
-size 2243854
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx
deleted file mode 100644
index 6af660bbdb2..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c967b19e7d946467c37f531d7cc6901e8e903ae59466000b58db08ff7edf611b
-size 119227
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5
new file mode 100644
index 00000000000..9f4e9f393ef
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92a91bb19f59cb602656805cd7efb2bb302022686fe21aef03a51e7ea3c0aef0
+size 736656
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5
new file mode 100644
index 00000000000..c9eee51c9c5
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f99391792c228f5f34bb83dc581d521e03e06f442140b8f3af3066dcb9592ce
+size 35136
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf
new file mode 100644
index 00000000000..d8dcbfe3b57
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ca7f76315099e24dedee2de127d8c24f3d28f62d7569b617d56958eb0bfedd5
+size 2228705
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx
new file mode 100644
index 00000000000..4d0b41d5596
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eefcac29c9ccf8760a6d3a9e55cb863c9e4ea804a1a78ba38db4c448cb41b20
+size 119223
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5
new file mode 100644
index 00000000000..08faea04b52
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4510202dd1e097698b2f862fafa8d076d2d8e11cf5908bf3883fe6f618d69fa7
+size 822288
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5
new file mode 100644
index 00000000000..3d0cb431766
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0bcf784e97396a9b14dc65912d043ad011dbe1c9c8d70e9eea1b45502eb7d7d
+size 38440
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf
new file mode 100644
index 00000000000..14ac057b300
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29c4e43204b5a655135c84fd7e4489d69ae542892f11f01d097a1fcd62a759ec
+size 2244914
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx
new file mode 100644
index 00000000000..c1a130a9f64
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb24eed9c59f80aae758dc2fb89130c782fbc07e14220da57d46936611d8c390
+size 119228
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5
deleted file mode 100644
index fbf0990ee70..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b25607c74d197a7116421014925ad4dcc10c326e561b193b1e2eb71152598369
-size 766368
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5
deleted file mode 100644
index ee4850c9acb..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:afb44cc0b2f1c821d4b79f4c0145edc5fc662d06ce13239fd2077e1d1e045783
-size 34960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf
deleted file mode 100644
index e46bbcf2a15..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:993e2d40dea8558c001a7321a4bbe4804877b2de36c3a266416310446c915ccb
-size 2226076
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx
deleted file mode 100644
index 9be1548020d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1a56007a28f971a86349be709cb2b5ce3821ef5f3ae19ff0f9dcd2841021a510
-size 119225
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5
deleted file mode 100644
index 1378a5e61da..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38fb5c443979d9468de740c26c1e3b2d8f27938c1ffb43ebf48ae1bef94196b3
-size 829672
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5
deleted file mode 100644
index 58244d511a7..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3bb53ebfca7a737737a1d01ff541d414c3cef07d507b3e360d360079239d723a
-size 37720
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf
deleted file mode 100644
index 4af1921ce48..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:05f85d264a457cdd81896bde03f51b2369343da5ade21b1c8df183a2b7e8f974
-size 2242450
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx
deleted file mode 100644
index 34133ce42ad..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cba3840238dc7d6c7d85eeda892da51abdccf1e80c60b9030fa781da42d16b9f
-size 119230
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5
new file mode 100644
index 00000000000..711ae0e8e75
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.annot.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcf123ed80e8c503ad4d2d1364be63f22d82379b32eb0c4f563f08cb4abbb2df
+size 766368
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5
new file mode 100644
index 00000000000..ca29220bbcc
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.scores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8467b31a8916052b511adf002a877b36a0b5fb0e64d035022b337beabd437da
+size 34960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf
new file mode 100644
index 00000000000..dd76708357f
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19e40f44f8bbd173cb325a9e14aa3240ca5725296abbfc4fb005eb2bb14a2cdd
+size 2227870
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx
new file mode 100644
index 00000000000..e19112ffa42
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snp.vcf.idx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdbec4dc5e2c9263da4b217e33fd7fbf4dac34f9f42821d3d8c69136b9131f62
+size 119226
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5
new file mode 100644
index 00000000000..bf62479462f
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.annot.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6539c887b88be6d4b2ed0b03866d2d20054c0cbb24f777d27cc043bcbcc69055
+size 829672
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5
new file mode 100644
index 00000000000..a65701f8e94
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.scores.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b19295ce01f6d6a84b48c88e8a53c23d65036b34f04a1ac7f13760b803813aa4
+size 37720
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf
new file mode 100644
index 00000000000..017e1ec2493
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34ca4c5ab46abe321506c5b43db550257e55c441f0a52553aca3acb5fc78ab63
+size 2244522
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx
new file mode 100644
index 00000000000..7fbb1578345
--- /dev/null
+++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.score.snpIndel.vcf.idx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95e34b18a698a7d4db138297b3f13c427d8b371d1c0e94f289f05ae240f438b
+size 119231
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
deleted file mode 100644
index e6ffb5c84fa..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f64ff3525514a1b3da32c2ea87a22dc46b57382f09286400878f182f60e41f6e
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
deleted file mode 100644
index d7516056325..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:05dae24fc2ae3e88654a53d49d23cf0345a4a358af8cdb97881f4498df9c7d7e
-size 354768
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
deleted file mode 100644
index dc18c6fa7e2..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c1fda6d8d6b4f200bfb7a2707ce544dd9a27ae483f1ca1d649e5dbddf330c69d
-size 506091
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
deleted file mode 100644
index d30433b70d0..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:336e6546b3c9fddbd6134c3e9f6cd47b45a2c05aa52a65ffadeb0a6d041b00f8
-size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
deleted file mode 100644
index e5ffe1456ac..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2f758e56b80afa511ad8e743ff651b2807e6c741617a51e9f7815e52a55e8e7
-size 3184
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
deleted file mode 100644
index d2ae036241d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2d665749610d4f9aa117dbbd098a4f9cf2fd139b679285fdf854c2e2656ca10c
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
deleted file mode 100644
index d30a719af1d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff23ea334269b3cc502646a7d301d956876717e781502cf3ccca6a0b4a36268a
-size 374750
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
deleted file mode 100644
index 348ebedf945..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be6d3e97dbe88caeead043c0264b7345589da11bf3e3842693bec9fccac9f802
-size 514532
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
deleted file mode 100644
index 9cbba14a079..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:76b90080d91b94a79c4a19ce0b26c1f63b173fd32b5148aba88087dd6c3bd7ac
-size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
deleted file mode 100644
index 36cc0a423c5..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:94b85b2bc701c9e895ac089629e5da035eec4fd2bd482a4b0cc41e4a78ed8750
-size 3184
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
deleted file mode 100644
index 79c760c631f..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8891aa2ff9b4ec967bcf1322076005ab7c1a3e95af246c0489c337890b5b4475
-size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
deleted file mode 100644
index 5fdff35c06d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:934d96ba164a225333a5d5b736a2ecfe874833d3f356c2e003ee4053005055ed
-size 127038
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
deleted file mode 100644
index 9f86fab9b13..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4a1937d6b3ea7cf030d154d5c9a6e7d2b14b3db38be2c934ccf06f48549eb3a8
-size 235812
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
deleted file mode 100644
index 6e60311572d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9bd8a9643b807bbc107c8edbcbf2b0d3112ab47c0ca90a8d37e6874bdb74f8cb
-size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
deleted file mode 100644
index 4fca186b572..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:49c8737964fd8944aa7df315ca977256f97c51dfcf720a8fce729e466b90829b
-size 2488
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
deleted file mode 100644
index 3f3047a05a4..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2be28cb801165772e3cefc27eff812f87a392f53873da6379571ed6419ec4f33
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
deleted file mode 100644
index 7741e476134..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3461d5a5e306a17a133260557b1753acd467a1f6c344551af16eba1eae6bfc8e
-size 354767
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
deleted file mode 100644
index 697a8dde554..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f5e2c57e426d79d87922e8dafbbf245380fc756e0f61b4dc935529527ab1e08e
-size 506090
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
deleted file mode 100644
index 5f20c187730..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8201352895fe4b1284886cd1b7a33516d25d7ba98a5f8e2f03c7437380cc823f
-size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
deleted file mode 100644
index 6306b6a8478..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:45a1dfd68dc4617de6537ec92711d486c7033adaa51944fe6cbed824bee890ff
-size 3184
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
deleted file mode 100644
index 59900e515e6..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bbda044e552390c1766e74c0209c6328c708b8dc789c0d0f8da99ea30ace3fac
-size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
deleted file mode 100644
index 9d8cdfe2ae8..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:318291edce230c68c1c8c8cd47326b64da3735f792ed501894c38123fcf5a738
-size 130238
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
deleted file mode 100644
index 6b3104e110c..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e80bc19607efaa536f9dee44ef5cca3ce2554a9d6c9b498694bd9de73f8da40d
-size 240054
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
deleted file mode 100644
index 9cd7f179d2a..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3b769079f7729d6c691f64fb30a2a3225fb9e4c0dd0ab72a8847daa9ed69fd8d
-size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
deleted file mode 100644
index 463e56dcfdf..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c73d66459b9ba13ccf739b6a09e68184ebaaa29f83451b51057ed9f6051beab2
-size 2488
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
deleted file mode 100644
index 60ffad8c4c3..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0aa00881b8362481f68fdfff6dc85ab708348b0f487c0d0da4f6d27a1a0ad81f
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
deleted file mode 100644
index d30a719af1d..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff23ea334269b3cc502646a7d301d956876717e781502cf3ccca6a0b4a36268a
-size 374750
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
deleted file mode 100644
index 348ebedf945..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be6d3e97dbe88caeead043c0264b7345589da11bf3e3842693bec9fccac9f802
-size 514532
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
deleted file mode 100644
index 02d8c94608e..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4eac1941fe274f44029780339614d4197211a10a6cb4bdb8191d82bf7b3ca933
-size 5984
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
deleted file mode 100644
index 6de63dfaf4b..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d692b4f62830aa24b882be7f7dcf4c63e9266ed2ff6dfe516715a6ed47d523ec
-size 3184
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
deleted file mode 100644
index 1af4242bbec..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7687aa701fe0fdb52d86a27ccc12a6cc8bb2b57906ea2335146396bfae47ea1b
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
deleted file mode 100644
index f99de98d4fb..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d7d481a06e9c5d27b8a322712818731b508ecb09309cacff1c5f24df1077d975
-size 368366
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
deleted file mode 100644
index 21c88876f37..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f4f417b26d9478fb36ac5372f1634140a4114f09d207a434f8d582a5936e9b9
-size 556675
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
deleted file mode 100644
index 17e3ab4ef97..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:048b4690d9f1fdce1e3dbabf995e8e306965ea8e00c21a92129d67e0e3b8fb5c
-size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
deleted file mode 100644
index 323cf93db29..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e4033c901c80e97d4668d38e40b6677ec68d5d6f960d498c90d6311c971d6ae
-size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
deleted file mode 100644
index 9638837ac77..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff6eb7db94e1e78d2aa4e2bad08eb92e0df1ae88ef29e27f7f81362ac56e4faa
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
deleted file mode 100644
index 02016de49a7..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:373207a645b1caa64fe15fd4fec77556c87c11f0f304b8e4920094a59cae89eb
-size 359135
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
deleted file mode 100644
index ac1fe518303..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4
-size 525312
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
deleted file mode 100644
index 6972268a95c..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:443ed54e4403ba517c2370e18f95200f9b9dc4648c914424344626bacab6c4f2
-size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
deleted file mode 100644
index 90420cf2917..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3cd7216a8a13adb6640b1e91d939600bc664202964b0cc74d76d8070c3422b75
-size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
deleted file mode 100644
index 4ae3f77a6dd..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:298e36701a70226f3720cd40cd6ca8f37404a807cbd193c23d764141f5594f36
-size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
deleted file mode 100644
index 8c7e18e918e..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff46e688731825bb03a6c9504ed7847c998c129a1e13c507a14f7adb56d733ad
-size 108247
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
deleted file mode 100644
index 2aba7f5c93a..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe5ca4414318a540488afbdea99293e0f67b02725839c13faa5b3ff39b959e7e
-size 259163
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
deleted file mode 100644
index 004a5bdb157..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7243a9e90836489a979905a386811a688bd07968c115063351b77bf91c72efc3
-size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
deleted file mode 100644
index 9cc88998aef..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:00393e27ad2c98fd4c36a003d7dcbc175d31d8346ace63e52b458df76a8d7457
-size 2496
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
deleted file mode 100644
index a52edeb5b40..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5b3040e1926e7b59c623e2b930e16556d26dd91c4d586120f7db156f3f2f14fe
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
deleted file mode 100644
index df2c423b7d9..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c4cbb64dbf96f8cbc10908aed4c8ec2e3fcf01af7a3512c90e48aa743af3bf28
-size 368366
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
deleted file mode 100644
index 9192d59204c..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:86e04d419ae8a211acbead86c467cd8c3578c3312f31fdc5600eaefd040ffc32
-size 556675
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
deleted file mode 100644
index 483bfe123ec..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8007c65df2a62f1b12cb2bd9d0818ea106edf2fc18610318e1e09def0f1bd77a
-size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
deleted file mode 100644
index b7ab0c0c576..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e6860775947699fe2df688d0c17321de501ef18c12c87dbb430221ba1c27e56b
-size 3168
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
deleted file mode 100644
index 624a515e7f8..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:223ee064aed2de5a2e0c7a773b08d730666fa28b49181984dc8f16d07233b2ef
-size 2664
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
deleted file mode 100644
index 54aa186cf73..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e0b377bad272e94bf11bca14ed7a7dd3c67296f347509d272b9538b695579199
-size 132823
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
deleted file mode 100644
index 1142e1f4599..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:426cec56e16eec10f47c824797c035772d1ebf3cf4f73972e8a541deca622cd3
-size 248813
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
deleted file mode 100644
index c91fcf083b5..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f80beea7f769595a56aa8af6d90335626fa5c93fe5f61761e998f59e774f3104
-size 2880
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
deleted file mode 100644
index 64fbf36622e..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1ba9bf669a916ee10750cf8cf78968af0372a5d272640be9fe3fba97fc4e6059
-size 2496
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
deleted file mode 100644
index 6a831bf1e51..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dce6eb6a9175112940d97573e3e99818a041f4c8311bd0d790b11bad7153cc90
-size 4960
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
deleted file mode 100644
index 02016de49a7..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:373207a645b1caa64fe15fd4fec77556c87c11f0f304b8e4920094a59cae89eb
-size 359135
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
deleted file mode 100644
index ac1fe518303..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4
-size 525312
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
deleted file mode 100644
index 45b09bcdafc..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9885e1ace48f972b88d808fe9dcf31aae828b0f327909c3922305d48659c9516
-size 5992
diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
deleted file mode 100644
index 3baaddc00b8..00000000000
--- a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:60dfec8ee105371c8b69b4c6790f6920c1f94adfad1bca58fa353a016224c3c4
-size 3168