-
Notifications
You must be signed in to change notification settings - Fork 597
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ec36b13
commit 31c4f06
Showing
14 changed files
with
676 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
213 changes: 213 additions & 0 deletions
213
...g/broadinstitute/hellbender/tools/walkers/realignmentfilter/FilterAlignmentArtifacts.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
package org.broadinstitute.hellbender.tools.walkers.realignmentfilter; | ||
|
||
import htsjdk.variant.variantcontext.VariantContext; | ||
import htsjdk.variant.variantcontext.VariantContextBuilder; | ||
import htsjdk.variant.variantcontext.writer.VariantContextWriter; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import htsjdk.variant.vcf.VCFHeaderLine; | ||
import org.apache.commons.lang.mutable.MutableInt; | ||
import org.apache.commons.math3.util.Pair; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.ArgumentCollection; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.ExperimentalFeature; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.engine.FeatureContext; | ||
import org.broadinstitute.hellbender.engine.ReadsContext; | ||
import org.broadinstitute.hellbender.engine.ReferenceContext; | ||
import org.broadinstitute.hellbender.engine.VariantWalker; | ||
import org.broadinstitute.hellbender.engine.filters.ReadFilter; | ||
import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine; | ||
import org.broadinstitute.hellbender.utils.Trilean; | ||
import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
import org.broadinstitute.hellbender.utils.read.ReadUtils; | ||
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; | ||
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; | ||
import picard.cmdline.programgroups.VariantFilteringProgramGroup; | ||
|
||
import java.io.File; | ||
import java.util.*; | ||
|
||
/** | ||
* <p>Filter false positive alignment artifacts from a VCF callset.</p> | ||
* | ||
* <p> | ||
* FilterAlignmentArtifacts identifies alignment artifacts, that is, apparent variants due to reads being mapped to the wrong genomic locus. | ||
* </p> | ||
* <p> | ||
* Alignment artifacts can occur whenever there is sufficient sequence similarity between two or more regions in the genome | ||
* to confuse the alignment algorithm. This can occur when the aligner for whatever reason overestimate how uniquely a read | ||
* maps, thereby assigning it too high of a mapping quality. It can also occur through no fault of the aligner due to gaps in | ||
* the reference, which can also hide the true position to which a read should map. By using a good alignment algorithm | ||
* (the GATK wrapper of BWA-MEM), giving it sensitive settings (which may have been impractically slow for the original | ||
* bam alignment) and mapping to the best available reference we can avoid these pitfalls. The last point is especially important: | ||
* one can (and should) use a BWA-MEM index image corresponding to the best reference, regardless of the reference to which | ||
* the bam was aligned. | ||
* </p> | ||
* <p> | ||
* This tool is featured in the Somatic Short Mutation calling Best Practice Workflow. | ||
* See <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11136">Tutorial#11136</a> for a | ||
* step-by-step description of the workflow and <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11127">Article#11127</a> | ||
* for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the | ||
* <a href="https://github.com/broadinstitute/gatk/tree/master/scripts/mutect2_wdl">Mutect2 WDL scripts directory</a>. | ||
* </p> | ||
* <p> | ||
* The bam input to this tool should be the reassembly bamout produced by HaplotypeCaller or Mutect2 in the process of generating | ||
* the input callset. The original bam will also work but might fail to filter some indels. The reference passed with the -R argument | ||
* must be the reference to which the input bam was realigned. This does not need to correspond to the reference of the BWA-MEM | ||
* index image. The latter should be derived from the best available reference, for example hg38 in humans as of February 2018. | ||
* </p> | ||
* | ||
* <h3>Usage example</h3> | ||
* <pre> | ||
* gatk FilterAlignmentArtifacts \ | ||
* -R hg19.fasta | ||
* -V somatic.vcf.gz \ | ||
* -I somatic_bamout.bam \ | ||
* --bwa-mem-index-image hg38.index_image \ | ||
* -O filtered.vcf.gz | ||
* </pre> | ||
* | ||
*/ | ||
@CommandLineProgramProperties( | ||
summary = "Filter alignment artifacts from a vcf callset.", | ||
oneLineSummary = "Filter alignment artifacts from a vcf callset.", | ||
programGroup = VariantFilteringProgramGroup.class | ||
) | ||
@DocumentedFeature | ||
@ExperimentalFeature | ||
public class FilterAlignmentArtifacts extends VariantWalker { | ||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, | ||
shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME, | ||
doc="The output filtered VCF file", optional=false) | ||
private final String outputVcf = null; | ||
|
||
public static final int DEFAULT_INDEL_START_TOLERANCE = 5; | ||
public static final String INDEL_START_TOLERANCE_LONG_NAME = "indel-start-tolerance"; | ||
@Argument(fullName = INDEL_START_TOLERANCE_LONG_NAME, | ||
doc="Max distance between indel start of aligned read in the bam and the variant in the vcf", optional=true) | ||
private int indelStartTolerance = DEFAULT_INDEL_START_TOLERANCE; | ||
|
||
public static final int DEFAULT_FRAGMENT_SIZE = 1000; | ||
public static final String FRAGMENT_SIZE_LONG_NAME = "fragment-size"; | ||
@Argument(fullName = FRAGMENT_SIZE_LONG_NAME, | ||
doc="Distance away from variant to look for reads' mates.", optional=true) | ||
private int fragmentSize = DEFAULT_FRAGMENT_SIZE; | ||
|
||
public static final int DEFAULT_MAX_FAILED_REALIGNMENTS = 3; | ||
public static final String MAX_FAILED_REALIGNMENTS_LONG_NAME = "max-failed-realignments"; | ||
@Argument(fullName = MAX_FAILED_REALIGNMENTS_LONG_NAME, | ||
doc="Maximum number of failed read realignments before a variant is rejected.", optional=true) | ||
private int maxFailedRealignments = DEFAULT_MAX_FAILED_REALIGNMENTS; | ||
|
||
public static final int DEFAULT_SUFFICIENT_GOOD_REALIGNMENTS = 2; | ||
public static final String SUFFICIENT_GOOD_REALIGNMENTS_LONG_NAME = "sufficient-good-realignments"; | ||
@Argument(fullName = SUFFICIENT_GOOD_REALIGNMENTS_LONG_NAME, | ||
doc="Sufficient number of good read realignments to accept a variant.", optional=true) | ||
private int sufficientGoodRealignments = DEFAULT_SUFFICIENT_GOOD_REALIGNMENTS; | ||
|
||
|
||
@ArgumentCollection | ||
protected RealignmentArgumentCollection realignmentArgumentCollection = new RealignmentArgumentCollection(); | ||
|
||
private VariantContextWriter vcfWriter; | ||
private RealignmentEngine realignmentEngine; | ||
|
||
@Override | ||
public List<ReadFilter> getDefaultReadFilters() { | ||
return Mutect2Engine.makeStandardMutect2ReadFilters(); | ||
} | ||
|
||
@Override | ||
public boolean requiresReads() { return true; } | ||
|
||
@Override | ||
public void onTraversalStart() { | ||
realignmentEngine = new RealignmentEngine(realignmentArgumentCollection); | ||
vcfWriter = createVCFWriter(new File(outputVcf)); | ||
|
||
final VCFHeader inputHeader = getHeaderForVariants(); | ||
final Set<VCFHeaderLine> headerLines = new HashSet<>(inputHeader.getMetaDataInSortedOrder()); | ||
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.ALIGNMENT_ARTIFACT_FILTER_NAME)); | ||
headerLines.addAll(getDefaultToolVCFHeaderLines()); | ||
final VCFHeader vcfHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples()); | ||
vcfWriter.writeHeader(vcfHeader); | ||
} | ||
|
||
@Override | ||
public Object onTraversalSuccess() { | ||
return "SUCCESS"; | ||
} | ||
|
||
@Override | ||
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext refContext, final FeatureContext fc) { | ||
Trilean passesFilter = vc.getNAlleles() == 1 ? Trilean.TRUE : Trilean.UNKNOWN; | ||
|
||
final MutableInt failedRealignmentCount = new MutableInt(0); | ||
final MutableInt succeededRealignmentCount = new MutableInt(0); | ||
|
||
final Map<GATKRead, GATKRead> mates = realignmentArgumentCollection.dontUseMates ? null | ||
: ReadUtils.getReadToMateMap(readsContext, fragmentSize); | ||
|
||
for (final GATKRead read : readsContext) { | ||
if (passesFilter != Trilean.UNKNOWN) { | ||
break; | ||
} else if ( !RealignmentEngine.supportsVariant(read, vc, indelStartTolerance)) { | ||
continue; | ||
} | ||
|
||
final RealignmentEngine.RealignmentResult readRealignment = realignmentEngine.realign(read); | ||
|
||
// if there's no mate we go by the read realignment | ||
if (mates == null || !mates.containsKey(read)) { | ||
(readRealignment.isGood() ? succeededRealignmentCount : failedRealignmentCount).increment(); | ||
} else { | ||
// check whether the pair maps uniquely | ||
final GATKRead mate = mates.get(read); | ||
final RealignmentEngine.RealignmentResult mateRealignment = realignmentEngine.realign(mate); | ||
|
||
final List<BwaMemAlignment> readRealignments = readRealignment.getRealignments(); | ||
final List<BwaMemAlignment> mateRealignments = mateRealignment.getRealignments(); | ||
final List<Pair<BwaMemAlignment, BwaMemAlignment>> plausiblePairs = RealignmentEngine.findPlausiblePairs(readRealignments, mateRealignments, realignmentArgumentCollection.maxReasonableFragmentLength); | ||
|
||
if (plausiblePairs.size() <= 1) { | ||
succeededRealignmentCount.increment(); | ||
} else { | ||
plausiblePairs.sort(Comparator.comparingInt(pair -> -pairScore(pair)) ); | ||
final int scoreDiff = pairScore(plausiblePairs.get(0)) - pairScore(plausiblePairs.get(1)); | ||
if (scoreDiff >= realignmentArgumentCollection.minAlignerScoreDifference) { | ||
succeededRealignmentCount.increment(); | ||
} else { | ||
failedRealignmentCount.increment(); | ||
} | ||
} | ||
} | ||
|
||
if (failedRealignmentCount.intValue() > maxFailedRealignments) { | ||
passesFilter = Trilean.FALSE; | ||
} else if (succeededRealignmentCount.intValue() >= sufficientGoodRealignments) { | ||
passesFilter = Trilean.TRUE; | ||
} | ||
} | ||
|
||
// if we haven't decided yet due to too few supporting reads, fail if there are more failures than successes | ||
passesFilter = passesFilter != Trilean.UNKNOWN ? passesFilter : | ||
Trilean.of(failedRealignmentCount.intValue() <= succeededRealignmentCount.intValue()); | ||
|
||
vcfWriter.add(passesFilter == Trilean.TRUE ? vc : new VariantContextBuilder(vc).filter(GATKVCFConstants.ALIGNMENT_ARTIFACT_FILTER_NAME).make()); | ||
} | ||
|
||
private static int pairScore(final Pair<BwaMemAlignment, BwaMemAlignment> pair) { | ||
return pair.getFirst().getAlignerScore() + pair.getSecond().getAlignerScore(); | ||
} | ||
|
||
@Override | ||
public void closeTool() { | ||
if ( vcfWriter != null ) { | ||
vcfWriter.close(); | ||
} | ||
} | ||
|
||
} |
62 changes: 62 additions & 0 deletions
62
...adinstitute/hellbender/tools/walkers/realignmentfilter/RealignmentArgumentCollection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package org.broadinstitute.hellbender.tools.walkers.realignmentfilter; | ||
|
||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.hellbender.tools.BwaMemIndexImageCreator; | ||
|
||
public class RealignmentArgumentCollection { | ||
public static final int DEFAULT_MIN_SEED_LENGTH = 14; | ||
public static final double DEFAULT_DROP_RATIO = 0.2; | ||
public static final double DEFAULT_SEED_SPLIT_FACTOR = 0.5; | ||
public static final int DEFAULT_MAX_REASONABLE_FRAGMENT_LENGTH = 100000; | ||
public static final int DEFAULT_MIN_ALIGNER_SCORE_DIFFERENCE = 20; | ||
public static final int DEFAULT_NUM_REGULAR_CONTIGS = 25; | ||
|
||
/** | ||
* BWA-mem index image created by {@link BwaMemIndexImageCreator} | ||
*/ | ||
@Argument(fullName = "bwa-mem-index-image", shortName = "index", doc = "BWA-mem index image") | ||
public String bwaMemIndexImage; | ||
|
||
/** | ||
* Turn off the default mate-aware realignment | ||
*/ | ||
@Argument(fullName = "dont-use-mates", doc = "Realign individual reads without using their mates", optional = true) | ||
public boolean dontUseMates = false; | ||
|
||
/** | ||
* Maximum fragment length to be considered a reasonable pair alignment | ||
*/ | ||
@Argument(fullName = "max-reasonable-fragment-length", doc = "Maximum fragment length to be considered a reasonable pair alignment", optional = true) | ||
public int maxReasonableFragmentLength = DEFAULT_MAX_REASONABLE_FRAGMENT_LENGTH; | ||
|
||
/** | ||
* Minimum difference between best and second-best alignment for a read to be considered well-mapped | ||
*/ | ||
@Argument(fullName = "min-aligner-score-difference", doc = "Minimum difference between best and second-best alignment for a read to be considered well-mapped", optional = true) | ||
public int minAlignerScoreDifference = DEFAULT_MIN_ALIGNER_SCORE_DIFFERENCE; | ||
|
||
/** | ||
* Number of regular i.e. non-alt contigs | ||
*/ | ||
@Argument(fullName = "num-regular-contigs", doc = "Number of regular i.e. non-alt contigs", optional = true) | ||
public int numRegularContigs = DEFAULT_NUM_REGULAR_CONTIGS; | ||
|
||
/** | ||
* BWA-mem minimum seed length | ||
*/ | ||
@Argument(fullName = "minimum-seed-length", shortName = "min-seed-length", optional = true, doc = "Minimum number of matching bases to seed a MEM") | ||
public int minSeedLength = DEFAULT_MIN_SEED_LENGTH; | ||
|
||
/** | ||
* BWA-mem extension drop ratio | ||
*/ | ||
@Argument(fullName = "drop-ratio", shortName = "drop-ratio", doc = "Fraction of best MEM extension score below which other extensions are dropped") | ||
public double dropRatio = DEFAULT_DROP_RATIO; | ||
|
||
/** | ||
* BWA-mem seed split factor | ||
*/ | ||
@Argument(fullName = "seed-split-factor", shortName = "split-factor", doc = "MEMs longer than the minimum seed length times this factor are split and re-seeded.") | ||
public double splitFactor = DEFAULT_SEED_SPLIT_FACTOR; | ||
|
||
} |
Oops, something went wrong.