Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prevent sequence dictionary validation when aligning reads #4308

Merged
merged 1 commit into from
May 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.broadinstitute.hellbender.cmdline.argumentcollections;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;

/**
* interface for argument collections that control how sequence dictionary validation should be handled
*/
public interface SequenceDictionaryValidationArgumentCollection {

/**
* Should sequence dictionary validation be performed
* @return true if the tool should perform sequence dictionary validation
*/
boolean performSequenceDictionaryValidation();


/**
* most tools will want to use this, it defaults to performing sequence dictionary validation but provides the option
* to disable it
*/
class StandardValidationCollection implements SequenceDictionaryValidationArgumentCollection {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like having these as inner classes so we don't have separate files for them. I assume that its the case that an inner class inside of an interface is static wrt the containing/implementing class ? Can/should these be static ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They're static by default in an interface and intelli complains that it's redundnat if you mark them as static. I can add it in though if you think it's clearer.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to change anything. I just didn't recall seeing that before, but thats about what I suspected.

@Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true)
private boolean disableSequenceDictionaryValidation = false;

@Override
public boolean performSequenceDictionaryValidation() {
return !disableSequenceDictionaryValidation;
}
}

/**
* doesn't provide a configuration argument, and always returns false, useful for tools that do not want to perform
* sequence dictionary validation, like aligners
*/
class NoValidationCollection implements SequenceDictionaryValidationArgumentCollection {
@Override
public boolean performSequenceDictionaryValidation() {
return false;
}
}
}
18 changes: 15 additions & 3 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
Expand Down Expand Up @@ -66,8 +67,8 @@ public abstract class GATKTool extends CommandLineProgram {
@Argument(fullName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, shortName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, doc = "Output traversal statistics every time this many seconds elapse", optional = true, common = true)
private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES;

@Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true, common = true)
private boolean disableSequenceDictionaryValidation = false;
@ArgumentCollection
private SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection();

@Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME,
shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME,
Expand Down Expand Up @@ -452,6 +453,17 @@ public boolean requiresIntervals() {
return false;
}


/**
* Get the {@link SequenceDictionaryValidationArgumentCollection} for the tool.
* Subclasses may override this method in order to customize validation options.
*
* @return a SequenceDictionaryValidationArgumentCollection
*/
protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
}

/**
* Load the master sequence dictionary as specified in {@code masterSequenceDictionaryFilename}.
* Will only load the master sequence dictionary if it has not already been loaded.
Expand Down Expand Up @@ -562,7 +574,7 @@ protected void onStartup() {

initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source

if ( ! disableSequenceDictionaryValidation ) {
if ( seqValidationArguments.performSequenceDictionaryValidation()) {
validateSequenceDictionaries();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.*;
import org.broadinstitute.hellbender.engine.TraversalParameters;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
Expand Down Expand Up @@ -84,8 +83,9 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
optional = true)
protected long bamPartitionSplitSize = 0;

@Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true)
private boolean disableSequenceDictionaryValidation = false;

@ArgumentCollection
protected SequenceDictionaryValidationArgumentCollection sequenceDictionaryValidationArguments = getSequenceDictionaryValidationArgumentCollection();

@Argument(doc = "For tools that write an output, write the output in multiple pieces (shards)",
fullName = SHARDED_OUTPUT_LONG_NAME,
Expand Down Expand Up @@ -187,6 +187,14 @@ public SerializableFunction<GATKRead, SimpleInterval> getReferenceWindowFunction
return ReferenceWindowFunctions.IDENTITY_FUNCTION;
}

/**
* subclasses can override this to provide different default behavior for sequence dictionary validation
* @return a SequenceDictionaryValidationArgumentCollection
*/
protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
}

/**
* Returns the "best available" sequence dictionary. This will be the reference sequence dictionary if
* there is a reference, otherwise it will be the sequence dictionary constructed from the reads if
Expand Down Expand Up @@ -391,7 +399,7 @@ public List<SimpleInterval> getIntervals() {
@Override
protected void runPipeline( JavaSparkContext sparkContext ) {
initializeToolInputs(sparkContext);
validateToolInputs();
validateSequenceDictionaries();
runTool(sparkContext);
}

Expand Down Expand Up @@ -485,8 +493,8 @@ protected List<SimpleInterval> editIntervals(final List<SimpleInterval> rawInter
/**
* Validates standard tool inputs against each other.
*/
private void validateToolInputs() {
if ( ! disableSequenceDictionaryValidation ) {
protected void validateSequenceDictionaries() {
if ( sequenceDictionaryValidationArguments.performSequenceDictionaryValidation() ) {
// Validate the reference sequence dictionary against the reads sequence dictionary, if both are present,
// using standard GATK validation settings (requiring a common subset of equivalent contigs without respect
// to ordering).
Expand All @@ -510,4 +518,5 @@ private void validateToolInputs() {
* @param ctx our Spark context
*/
protected abstract void runTool( JavaSparkContext ctx );

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
Expand All @@ -22,8 +23,8 @@
import java.util.List;

@DocumentedFeature
@CommandLineProgramProperties(summary = "Runs BWA",
oneLineSummary = "BWA on Spark",
@CommandLineProgramProperties(summary = "Align reads using BWA",
oneLineSummary = "Align reads to a given reference using BWA on Spark",
programGroup = ReadDataManipulationProgramGroup.class)
@BetaFeature
public final class BwaSpark extends GATKSparkTool {
Expand Down Expand Up @@ -54,6 +55,11 @@ public List<ReadFilter> getDefaultReadFilters() {
return Arrays.asList(ReadFilterLibrary.PRIMARY_LINE, ReadFilterLibrary.SEQ_IS_STORED);
}

@Override
public SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection(){
return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
}

@Override
protected void runTool(final JavaSparkContext ctx) {
try ( final BwaSparkEngine bwaEngine =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
Expand Down Expand Up @@ -57,6 +58,11 @@ public final class BwaAndMarkDuplicatesPipelineSpark extends GATKSparkTool {
protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection();


@Override
protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
}

@Override
protected void runTool(final JavaSparkContext ctx) {
try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ public SerializableFunction<GATKRead, SimpleInterval> getReferenceWindowFunction
return BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION;
}

@Override
protected void validateSequenceDictionaries(){
//don't validate unaligned reads because we don't require them to have a sequence dictionary
if( !align ){
super.validateSequenceDictionaries();
}
}

@Override
protected void runTool(final JavaSparkContext ctx) {
if (joinStrategy == JoinStrategy.BROADCAST && ! getReference().isCompatibleWithSparkBroadcast()){
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.broadinstitute.hellbender.cmdline.argumentcollections;

import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineArgumentParser;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.testng.Assert;
import org.testng.annotations.Test;

public class SequenceDictionaryValidationArgumentCollectionTest {

private static class StandardArgumentCollection {
@ArgumentCollection
public SequenceDictionaryValidationArgumentCollection standard = new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
}

@Test
public void testStandardArgumentCollectionDefaultsToTrue(){
Assert.assertTrue(new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection().performSequenceDictionaryValidation());
}

@Test
public void testStandardArgumentCollectionCanBeDisabled(){
final String[] disabled = {"--"+StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME};
StandardArgumentCollection std = new StandardArgumentCollection();
CommandLineParser clp = new CommandLineArgumentParser(std);
clp.parseArguments(System.out, disabled);
Assert.assertFalse(std.standard.performSequenceDictionaryValidation());
}

@Test
public void testNoValidationDefaultsToFalse(){
Assert.assertFalse(new SequenceDictionaryValidationArgumentCollection.NoValidationCollection().performSequenceDictionaryValidation());
}

}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.broadinstitute.hellbender.tools.spark.bwa;

import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.pipelines.BwaAndMarkDuplicatesPipelineSpark;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
Expand Down Expand Up @@ -34,8 +33,7 @@ public void test() throws Exception {
args.addReference(ref);
args.addInput(input);
args.addOutput(output);
args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true);
args.add("--"+MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
args.addBooleanArgument(MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, true);
this.runCommandLine(args.getArgsArray());

SamAssertionUtils.assertSamsEqual(output, expectedSam);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ public void testPairedEnd() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref);
args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input);
args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true); // disable since input does not have a sequence dictionary
args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true);
args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1");
args.addOutput(output);
Expand All @@ -52,7 +51,6 @@ public void testSingleEnd() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref);
args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input);
args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME , true); // disable since input does not have a sequence dictionary
args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true);
args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1");
args.addOutput(output);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,6 @@ private PipelineTest(String referenceURL, String bam, String outputExtension, St
this.expectedVcfFileName = expectedVcfFileName;
}

public String getCommandLine() {
return " -R " + referenceURL +
" -I " + bam +
" " + args +
(knownSites.isEmpty() ? "": " --known-sites " + knownSites) +
" -O %s";
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was totally unused.

@Override
public String toString() {
return String.format("ReadsPipeline(bam='%s', args='%s')", bam, args);
Expand Down Expand Up @@ -97,7 +89,7 @@ public Object[][] createReadsPipelineSparkTestData() {
{new PipelineTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--join-strategy OVERLAPS_PARTITIONER --read-shard-padding 1000 --known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedMultipleKnownSitesVcf)},

// BWA-MEM
{new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --disable-sequence-dictionary-validation true --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
{new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
};
}

Expand Down