diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java
new file mode 100644
index 00000000000..b8ae6e4720d
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java
@@ -0,0 +1,42 @@
+package org.broadinstitute.hellbender.cmdline.argumentcollections;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+
+/**
+ * interface for argument collections that control how sequence dictionary validation should be handled
+ */
+public interface SequenceDictionaryValidationArgumentCollection {
+
+    /**
+     * Should sequence dictionary validation be performed
+     * @return true if the tool should perform sequence dictionary validation
+     */
+    boolean performSequenceDictionaryValidation();
+
+
+    /**
+     * most tools will want to use this, it defaults to performing sequence dictionary validation but provides the option
+     * to disable it
+     */
+    class StandardValidationCollection implements SequenceDictionaryValidationArgumentCollection {
+        @Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true)
+        private boolean disableSequenceDictionaryValidation = false;
+
+        @Override
+        public boolean performSequenceDictionaryValidation() {
+            return !disableSequenceDictionaryValidation;
+        }
+    }
+
+    /**
+     * doesn't provide a configuration argument, and always returns false, useful for tools that do not want to perform
+     * sequence dictionary validation, like aligners
+     */
+    class NoValidationCollection implements SequenceDictionaryValidationArgumentCollection {
+        @Override
+        public boolean performSequenceDictionaryValidation() {
+            return false;
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
index c9eee1182d7..001ce1a893f 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -21,6 +21,7 @@
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
 import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.transformers.ReadTransformer;
 import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
@@ -66,8 +67,8 @@ public abstract class GATKTool extends CommandLineProgram {
     @Argument(fullName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, shortName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, doc = "Output traversal statistics every time this many seconds elapse", optional = true, common = true)
     private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES;
 
-    @Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true, common = true)
-    private boolean disableSequenceDictionaryValidation = false;
+    @ArgumentCollection
+    private SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection();
 
     @Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME,
             shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME,
@@ -452,6 +453,17 @@ public boolean requiresIntervals() {
         return false;
     }
 
+
+    /**
+     * Get the {@link SequenceDictionaryValidationArgumentCollection} for the tool.
+     * Subclasses may override this method in order to customize validation options.
+     *
+     * @return a SequenceDictionaryValidationArgumentCollection
+     */
+    protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
+        return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
+    }
+
     /**
      * Load the master sequence dictionary as specified in {@code masterSequenceDictionaryFilename}.
      * Will only load the master sequence dictionary if it has not already been loaded.
@@ -562,7 +574,7 @@ protected void onStartup() {
 
         initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source
 
-        if ( ! disableSequenceDictionaryValidation ) {
+        if ( seqValidationArguments.performSequenceDictionaryValidation()) {
             validateSequenceDictionaries();
         }
 
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java
index 91221c05781..c257bb18f72 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java
@@ -8,7 +8,6 @@
 import org.broadinstitute.barclay.argparser.ArgumentCollection;
 import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
 import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.*;
 import org.broadinstitute.hellbender.engine.TraversalParameters;
 import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
@@ -84,8 +83,9 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
             optional = true)
     protected long bamPartitionSplitSize = 0;
 
-    @Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true)
-    private boolean disableSequenceDictionaryValidation = false;
+
+    @ArgumentCollection
+    protected SequenceDictionaryValidationArgumentCollection sequenceDictionaryValidationArguments = getSequenceDictionaryValidationArgumentCollection();
 
     @Argument(doc = "For tools that write an output, write the output in multiple pieces (shards)",
             fullName = SHARDED_OUTPUT_LONG_NAME,
@@ -187,6 +187,14 @@ public SerializableFunction getReferenceWindowFunction
         return ReferenceWindowFunctions.IDENTITY_FUNCTION;
     }
 
+    /**
+     * subclasses can override this to provide different default behavior for sequence dictionary validation
+     * @return a SequenceDictionaryValidationArgumentCollection
+     */
+    protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
+        return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
+    }
+
     /**
      * Returns the "best available" sequence dictionary. This will be the reference sequence dictionary if
      * there is a reference, otherwise it will be the sequence dictionary constructed from the reads if
@@ -391,7 +399,7 @@ public List getIntervals() {
     @Override
     protected void runPipeline( JavaSparkContext sparkContext ) {
         initializeToolInputs(sparkContext);
-        validateToolInputs();
+        validateSequenceDictionaries();
         runTool(sparkContext);
     }
 
@@ -485,8 +493,8 @@ protected List editIntervals(final List rawInter
     /**
      * Validates standard tool inputs against each other.
      */
-    private void validateToolInputs() {
-        if ( ! disableSequenceDictionaryValidation ) {
+    protected void validateSequenceDictionaries() {
+        if ( sequenceDictionaryValidationArguments.performSequenceDictionaryValidation() ) {
             // Validate the reference sequence dictionary against the reads sequence dictionary, if both are present,
             // using standard GATK validation settings (requiring a common subset of equivalent contigs without respect
             // to ordering).
@@ -510,4 +518,5 @@ private void validateToolInputs() {
      * @param ctx our Spark context
      */
     protected abstract void runTool( JavaSparkContext ctx );
+
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java
index dd69ae4abc7..70df3334b23 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java
@@ -8,6 +8,7 @@
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
@@ -22,8 +23,8 @@
 import java.util.List;
 
 @DocumentedFeature
-@CommandLineProgramProperties(summary = "Runs BWA",
-        oneLineSummary = "BWA on Spark",
+@CommandLineProgramProperties(summary = "Align reads using BWA",
+        oneLineSummary = "Align reads to a given reference using BWA on Spark",
         programGroup = ReadDataManipulationProgramGroup.class)
 @BetaFeature
 public final class BwaSpark extends GATKSparkTool {
@@ -54,6 +55,11 @@ public List getDefaultReadFilters() {
         return Arrays.asList(ReadFilterLibrary.PRIMARY_LINE, ReadFilterLibrary.SEQ_IS_STORED);
     }
 
+    @Override
+    public SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection(){
+        return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
+    }
+
     @Override
     protected void runTool(final JavaSparkContext ctx) {
         try ( final BwaSparkEngine bwaEngine =
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
index 035ec7988e8..303bcd81a74 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
@@ -8,6 +8,7 @@
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
 import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
@@ -57,6 +58,11 @@ public final class BwaAndMarkDuplicatesPipelineSpark extends GATKSparkTool {
     protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection();
 
+    @Override
+    protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
+        return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
+    }
+
     @Override
     protected void runTool(final JavaSparkContext ctx) {
         try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java
index 690afdf8d74..d1325ad54f3 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java
@@ -142,6 +142,14 @@ public SerializableFunction getReferenceWindowFunction
         return BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION;
     }
 
+    @Override
+    protected void validateSequenceDictionaries(){
+        //don't validate unaligned reads because we don't require them to have a sequence dictionary
+        if( !align ){
+            super.validateSequenceDictionaries();
+        }
+    }
+
     @Override
     protected void runTool(final JavaSparkContext ctx) {
         if (joinStrategy == JoinStrategy.BROADCAST && ! getReference().isCompatibleWithSparkBroadcast()){
diff --git a/src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java b/src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java
new file mode 100644
index 00000000000..76b1065b9d8
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java
@@ -0,0 +1,37 @@
+package org.broadinstitute.hellbender.cmdline.argumentcollections;
+
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.CommandLineArgumentParser;
+import org.broadinstitute.barclay.argparser.CommandLineParser;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class SequenceDictionaryValidationArgumentCollectionTest {
+
+    private static class StandardArgumentCollection {
+        @ArgumentCollection
+        public SequenceDictionaryValidationArgumentCollection standard = new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
+    }
+
+    @Test
+    public void testStandardArgumentCollectionDefaultsToTrue(){
+        Assert.assertTrue(new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection().performSequenceDictionaryValidation());
+    }
+
+    @Test
+    public void testStandardArgumentCollectionCanBeDisabled(){
+        final String[] disabled = {"--"+StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME};
+        StandardArgumentCollection std = new StandardArgumentCollection();
+        CommandLineParser clp = new CommandLineArgumentParser(std);
+        clp.parseArguments(System.out, disabled);
+        Assert.assertFalse(std.standard.performSequenceDictionaryValidation());
+    }
+
+    @Test
+    public void testNoValidationDefaultsToFalse(){
+        Assert.assertFalse(new SequenceDictionaryValidationArgumentCollection.NoValidationCollection().performSequenceDictionaryValidation());
+    }
+
+}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java
index e149fd2ca58..c9da134feb1 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java
@@ -1,7 +1,6 @@
 package org.broadinstitute.hellbender.tools.spark.bwa;
 
 import org.broadinstitute.hellbender.CommandLineProgramTest;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
 import org.broadinstitute.hellbender.tools.spark.pipelines.BwaAndMarkDuplicatesPipelineSpark;
 import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
@@ -34,8 +33,7 @@ public void test() throws Exception {
         args.addReference(ref);
         args.addInput(input);
         args.addOutput(output);
-        args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true);
-        args.add("--"+MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
+        args.addBooleanArgument(MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, true);
         this.runCommandLine(args.getArgsArray());
 
         SamAssertionUtils.assertSamsEqual(output, expectedSam);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java
index ae495151857..7519c9bb2ea 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java
@@ -30,7 +30,6 @@ public void testPairedEnd() throws Exception {
         final ArgumentsBuilder args = new ArgumentsBuilder();
         args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref);
         args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input);
-        args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true); // disable since input does not have a sequence dictionary
         args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true);
         args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1");
         args.addOutput(output);
@@ -52,7 +51,6 @@ public void testSingleEnd() throws Exception {
         final ArgumentsBuilder args = new ArgumentsBuilder();
         args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref);
         args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input);
-        args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME , true); // disable since input does not have a sequence dictionary
         args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true);
         args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1");
         args.addOutput(output);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java
index 3dd6ba2d649..3c752e8ed7c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java
@@ -38,14 +38,6 @@ private PipelineTest(String referenceURL, String bam, String outputExtension, St
             this.expectedVcfFileName = expectedVcfFileName;
         }
 
-        public String getCommandLine() {
-            return " -R " + referenceURL +
-                    " -I " + bam +
-                    " " + args +
-                    (knownSites.isEmpty() ? "": " --known-sites " + knownSites) +
-                    " -O %s";
-        }
-
         @Override
         public String toString() {
             return String.format("ReadsPipeline(bam='%s', args='%s')", bam, args);
@@ -97,7 +89,7 @@ public Object[][] createReadsPipelineSparkTestData() {
                 {new PipelineTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--join-strategy OVERLAPS_PARTITIONER --read-shard-padding 1000 --known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedMultipleKnownSitesVcf)},
 
                 // BWA-MEM
-                {new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --disable-sequence-dictionary-validation true --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
+                {new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
         };
     }
 
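For tool authors building on this change, the intended pattern is the one BwaSpark and BwaAndMarkDuplicatesPipelineSpark use above: override getSequenceDictionaryValidationArgumentCollection() and return the collection that matches the tool's needs. The following is a minimal, illustrative sketch (not part of this patch) of a hypothetical Spark tool that opts out of sequence dictionary validation entirely; the class name ExampleAlignerSpark, its summary strings, and the empty runTool body are assumptions made for the example.

    package org.broadinstitute.hellbender.tools.examples;

    import org.apache.spark.api.java.JavaSparkContext;
    import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
    import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
    import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
    import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

    @CommandLineProgramProperties(summary = "Example aligner-style tool (illustrative only)",
            oneLineSummary = "Illustrates opting out of sequence dictionary validation",
            programGroup = ReadDataManipulationProgramGroup.class)
    public final class ExampleAlignerSpark extends GATKSparkTool {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean requiresReads() {
            return true;
        }

        // Aligner-style tools accept unaligned input that may carry no sequence dictionary,
        // so return NoValidationCollection rather than asking users to pass
        // --disable-sequence-dictionary-validation on every invocation.
        @Override
        protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
            return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
        }

        @Override
        protected void runTool(final JavaSparkContext ctx) {
            // Tool-specific processing would go here.
        }
    }

Tools that simply want the default behavior need no changes: the base classes return StandardValidationCollection, which keeps validation on but exposes --disable-sequence-dictionary-validation for users to turn it off.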