Realignment filter

broadinstitute · Apr 24, 2018 · 3eeef83 · 3eeef83
1 parent 36eb37f
commit 3eeef83
Show file tree

Hide file tree

Showing 10 changed files with 491 additions and 5 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/ReadsContext.java b/src/main/java/org/broadinstitute/hellbender/engine/ReadsContext.java
@@ -2,11 +2,12 @@
 
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.iterators.ReadFilteringIterator;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 
-import java.util.Collections;
-import java.util.Iterator;
+import java.util.*;
+import java.util.stream.Collectors;
 
 /**
  * Wrapper around ReadsDataSource that presents reads overlapping a specific interval to a client,
@@ -99,4 +100,35 @@ public Iterator<GATKRead> iterator() {
                 dataSource.query(interval) :
                 new ReadFilteringIterator(dataSource.query(interval), readFilter);
     }
+
+    /**
+     * Create a map of reads overlapping {@code interval} to their mates by looking for all possible mates within some
+     * maximum fragment size.  This is not guaranteed to find all mates, in particular near structural variant breakpoints
+     * where mates may align far away.
+     *
+     * The algorithm is:
+     * 1) make two maps of read name --> read for reads overlapping {@code interval}, one for first-of-pair reads and one
+     *    for second-of-pair reads.
+     * 2) For all reads in an expanded interval padded by {@code fragmentSize} on both sides look for a read of the same
+     *    that is second-of-pair if this read is first-of-pair or vice-versa.  If such a read is found then this is that read's mate.
+     *
+     * @param fragmentSize the maximum distance on either side of {@code interval} to look for mates.
+     * @return a map of reads ot their mates for all reads for which a mate could be found.
+     */
+    public Map<GATKRead, GATKRead> getReadToMateMap(final int fragmentSize) {
+        final Map<String, GATKRead> readOnes = new HashMap<>();
+        final Map<String, GATKRead> readTwos = new HashMap<>();
+        Utils.stream(dataSource.query(interval)).forEach(read -> (read.isFirstOfPair() ? readOnes : readTwos).put(read.getName(), read));
+
+        final Map<GATKRead, GATKRead> result = new HashMap<>();
+        final SimpleInterval expandedInterval = new SimpleInterval(interval.getContig(), Math.max(1, interval.getStart() - fragmentSize), interval.getEnd() + fragmentSize);
+        Utils.stream(dataSource.query(expandedInterval)).forEach(mate -> {
+            final GATKRead read = (mate.isFirstOfPair() ? readTwos : readOnes).get(mate.getName());
+            if (read != null) {
+                result.put(read, mate);
+            }
+        });
+
+        return result;
+    }
 }
diff --git a/...g/broadinstitute/hellbender/tools/walkers/realignmentfilter/FilterAlignmentArtifacts.java b/...g/broadinstitute/hellbender/tools/walkers/realignmentfilter/FilterAlignmentArtifacts.java
@@ -0,0 +1,274 @@
+package org.broadinstitute.hellbender.tools.walkers.realignmentfilter;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMFlag;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFHeaderLine;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.math3.util.Pair;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.ReadsContext;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.engine.VariantWalker;
+import org.broadinstitute.hellbender.utils.Trilean;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
+import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.read.ReadUtils;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
+import picard.cmdline.programgroups.VariantFilteringProgramGroup;
+
+import java.io.File;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * <p>Filter false positive alignment artifacts from a VCF callset.</p>
+ *
+ * <p>
+ *     FilterAlignmentArtifacts identifies alignment artifacts, that is, apparent variants due to reads being mapped to the wrong genomic locus.
+ * </p>
+ * <p>
+ *     Alignment artifacts can occur whenever there is sufficient sequence similarity between two or more regions in the genome
+ *     to confuse the alignment algorithm.  This can occur when the aligner for whatever reason overestimate how uniquely a read
+ *     maps, thereby assigning it too high of a mapping quality.  It can also occur through no fault of the aligner due to gaps in
+ *     the reference, which can also hide the true position to which a read should map.  By using a good alignment algorithm
+ *     (the GATK wrapper of BWA-MEM), giving it sensitive settings (which may have been impractically slow for the original
+ *     bam alignment) and mapping to the best available reference we can avoid these pitfalls.  The last point is especially important:
+ *     one can (and should) use a BWA-MEM index image corresponding to the best reference, regardless of the reference to which
+ *     the bam was aligned.
+ * </p>
+ * <p>
+ *     This tool is featured in the Somatic Short Mutation calling Best Practice Workflow.
+ *     See <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11136">Tutorial#11136</a> for a
+ *     step-by-step description of the workflow and <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11127">Article#11127</a>
+ *     for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the
+ *     <a href="https://github.com/broadinstitute/gatk/tree/master/scripts/mutect2_wdl">Mutect2 WDL scripts directory</a>.
+ * </p>
+ * <p>
+ *     The bam input to this tool should be the reassembly bamout produced by HaplotypeCaller or Mutect2 in the process of generating
+ *     the input callset.  The original bam will also work but might fail to filter some indels.  The reference passed with the -R argument
+ *     but be the reference to which the input bam was realigned.  This does not need to correspond to the reference of the BWAS-MEM
+ *     index image.  The latter should be derived from the best available reference, for example hg38 in humans as of February 2018.
+ * </p>
+ *
+ * <h3>Usage example</h3>
+ * <pre>
+ * gatk FilterAlignmentArtifacts \
+ *   -R hg19.fasta
+ *   -V somatic.vcf.gz \
+ *   -I somatic_bamout.bam \
+ *   --bwa-mem-index-image hg38.index_image \
+ *   -O filtered.vcf.gz
+ * </pre>
+ *
+ */
+@CommandLineProgramProperties(
+        summary = "Filter alignment artifacts from a vcf callset.",
+        oneLineSummary = "Filter alignment artifacts from a vcf callset.",
+        programGroup = VariantFilteringProgramGroup.class
+)
+@DocumentedFeature
+@ExperimentalFeature
+public class FilterAlignmentArtifacts extends VariantWalker {
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+            shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+            doc="The output filtered VCF file", optional=false)
+    private final String outputVcf = null;
+
+    public static final int DEFAULT_INDEL_START_TOLERANCE = 5;
+    public static final String INDEL_START_TOLERANCE_LONG_NAME = "indel-start-tolerance";
+    @Argument(fullName = INDEL_START_TOLERANCE_LONG_NAME,
+            doc="Max distance between indel start of aligned read in the bam and the variant in the vcf", optional=true)
+    private int indelStartTolerance = DEFAULT_INDEL_START_TOLERANCE;
+
+    public static final int DEFAULT_FRAGMENT_SIZE = 1000;
+    public static final String FRAGMENT_SIZE_LONG_NAME = "fragment-size";
+    @Argument(fullName = FRAGMENT_SIZE_LONG_NAME,
+            doc="Distance away from variant to look for reads' mates.", optional=true)
+    private int fragmentSize = DEFAULT_FRAGMENT_SIZE;
+
+    public static final int DEFAULT_MIN_MAPPING_QUALITY = 20;
+    public static final String MIN_MAPPING_QUALITY_LONG_NAME = "min-mapping-quality";
+    @Argument(fullName = MIN_MAPPING_QUALITY_LONG_NAME,
+            doc="Ignore reads with mapping quality less than this", optional=true)
+    private int minMappingQuality = DEFAULT_MIN_MAPPING_QUALITY;
+
+    public static final int DEFAULT_MAX_FAILED_REALIGNMENTS = 3;
+    public static final String MAX_FAILED_REALIGNMENTS_LONG_NAME = "max-failed-realignments";
+    @Argument(fullName = MAX_FAILED_REALIGNMENTS_LONG_NAME,
+            doc="Maximum number of failed read realignments before a variant is rejected.", optional=true)
+    private int maxFailedRealignments = DEFAULT_MAX_FAILED_REALIGNMENTS;
+
+    public static final int DEFAULT_SUFFICIENT_GOOD_REALIGNMENTS = 2;
+    public static final String SUFFICIENT_GOOD_REALIGNMENTS_LONG_NAME = "sufficient-good-realignments";
+    @Argument(fullName = SUFFICIENT_GOOD_REALIGNMENTS_LONG_NAME,
+            doc="Sufficient number of good read realignments to accept a variant.", optional=true)
+    private int sufficientGoodRealignments = DEFAULT_SUFFICIENT_GOOD_REALIGNMENTS;
+
+
+    @ArgumentCollection
+    protected RealignmentArgumentCollection realignmentArgumentCollection = new RealignmentArgumentCollection();
+
+    private VariantContextWriter vcfWriter;
+    private Realigner realigner;
+
+    @Override
+    public boolean requiresReads() { return true; }
+
+    @Override
+    public void onTraversalStart() {
+        realigner = new Realigner(realignmentArgumentCollection);
+        vcfWriter = createVCFWriter(new File(outputVcf));
+
+        final VCFHeader inputHeader = getHeaderForVariants();
+        final Set<VCFHeaderLine> headerLines = new HashSet<>(inputHeader.getMetaDataInSortedOrder());
+        headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.ALIGNMENT_ARTIFACT_FILTER_NAME));
+        headerLines.addAll(getDefaultToolVCFHeaderLines());
+        final VCFHeader vcfHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples());
+        vcfWriter.writeHeader(vcfHeader);
+    }
+
+    @Override
+    public Object onTraversalSuccess() {
+        return "SUCCESS";
+    }
+
+    @Override
+    public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext refContext, final FeatureContext fc) {
+        Trilean passesFilter = Trilean.UNKNOWN;
+
+        if (vc.getNAlleles() == 1) {
+            passesFilter = Trilean.TRUE;
+        }
+
+        final MutableInt failedRealignmentCount = new MutableInt(0);
+        final MutableInt succeededRealignmentCount = new MutableInt(0);
+
+        final Map<GATKRead, GATKRead> mates = realignmentArgumentCollection.dontUseMates ? null
+                : readsContext.getReadToMateMap(fragmentSize);
+
+        for (final GATKRead read : readsContext) {
+            if (passesFilter != Trilean.UNKNOWN) {
+                break;
+            } else if (read.getMappingQuality() < minMappingQuality || !supportsVariant(read, vc)) {
+                continue;
+            }
+
+            // First check the read itself.  If it maps uniquely we don't need to bother with the mate
+            final Realigner.RealignmentResult readRealignment = realigner.realign(read);
+
+            // if there's no mate we go by the read realignment
+            if (mates == null || !mates.containsKey(read)) {
+                (readRealignment.isGood() ? succeededRealignmentCount : failedRealignmentCount).increment();
+            } else {
+                // check whether the pair maps uniquely
+                final GATKRead mate = mates.get(read);
+                final Realigner.RealignmentResult mateRealignment = realigner.realign(mate);
+
+                final List<BwaMemAlignment> readRealignments = readRealignment.getRealignments();
+                final List<BwaMemAlignment> mateRealignments = mateRealignment.getRealignments();
+
+                // note that we realigned the original aligned bases of the read and mate, not the raw sequenced bases
+                // thus one of them has been reverse-complemented and we look for pairs that map to the same strand
+                final List<Pair<BwaMemAlignment, BwaMemAlignment>> plausiblePairs = readRealignments.stream()
+                        .flatMap(r -> mateRealignments.stream()
+                                .filter(m -> m.getRefId() == r.getRefId())
+                                .filter(m -> Math.abs(m.getRefStart() - r.getRefStart()) < realignmentArgumentCollection.maxReasonableFragmentLength)
+                                .filter(m -> SAMFlag.READ_REVERSE_STRAND.isSet(m.getSamFlag()) == SAMFlag.READ_REVERSE_STRAND.isSet(r.getSamFlag()))
+                                .map(m -> new Pair<BwaMemAlignment, BwaMemAlignment>(r,m)))
+                        .collect(Collectors.toList());
+
+                if (plausiblePairs.size() <= 1) {
+                    succeededRealignmentCount.increment();
+                } else {
+                    plausiblePairs.sort(Comparator.comparingInt(pair -> -pairScore(pair)) );
+                    final int scoreDiff = pairScore(plausiblePairs.get(0)) - pairScore(plausiblePairs.get(1));
+                    if (scoreDiff >= realignmentArgumentCollection.minAlignerScoreDifference) {
+                        succeededRealignmentCount.increment();
+                    } else {
+                        failedRealignmentCount.increment();
+                    }
+                }
+            }
+
+            if (failedRealignmentCount.intValue() > maxFailedRealignments) {
+                passesFilter = Trilean.FALSE;
+            } else if (succeededRealignmentCount.intValue() >= sufficientGoodRealignments) {
+                passesFilter = Trilean.TRUE;
+            }
+        }
+
+        // if we haven't decided yet due to too few supporting reads, fail if there are more failures than successes
+        passesFilter = passesFilter != Trilean.UNKNOWN ? passesFilter :
+                Trilean.of(failedRealignmentCount.intValue() <= succeededRealignmentCount.intValue());
+
+        vcfWriter.add(passesFilter == Trilean.TRUE ? vc : new VariantContextBuilder(vc).filter(GATKVCFConstants.ALIGNMENT_ARTIFACT_FILTER_NAME).make());
+    }
+
+    private static int pairScore(final Pair<BwaMemAlignment, BwaMemAlignment> pair) {
+        return pair.getFirst().getAlignerScore() + pair.getSecond().getAlignerScore();
+    }
+
+    @Override
+    public void closeTool() {
+        if ( vcfWriter != null ) {
+            vcfWriter.close();
+        }
+    }
+
+    private boolean supportsVariant(final GATKRead read, final VariantContext vc) {
+        final byte[] readBases = read.getBasesNoCopy();
+
+        final int variantPositionInRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(),
+                vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL, true);
+        if ( variantPositionInRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED || AlignmentUtils.isInsideDeletion(read.getCigar(), variantPositionInRead)) {
+            return false;
+        }
+
+        for (final Allele allele : vc.getAlternateAlleles()) {
+            final int referenceLength = vc.getReference().length();
+            if (allele.length() == referenceLength) {   // SNP or MNP -- check whether read bases match variant allele bases
+                if (allele.basesMatch(ArrayUtils.subarray(readBases, variantPositionInRead, Math.min(variantPositionInRead + allele.length(), readBases.length)))) {
+                    return true;
+                }
+            } else {    // indel -- check if the read has the right CIGAR operator near position -- we don't require exact an exact match because indel representation is non-unique
+                final boolean isDeletion = allele.length() < referenceLength;
+                int readPosition = 0;    // offset by 1 because the variant position is one base before the first indel base
+                for (final CigarElement cigarElement : read.getCigarElements()) {
+                    if (Math.abs(readPosition - variantPositionInRead) <= indelStartTolerance) {
+                        if (isDeletion && mightSupportDeletion(cigarElement) || !isDeletion && mightSupportInsertion(cigarElement)) {
+                            return true;
+                        }
+                    } else {
+                        readPosition += cigarElement.getLength();
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
+    private static boolean mightSupportDeletion(final CigarElement cigarElement) {
+        final CigarOperator cigarOperator = cigarElement.getOperator();
+        return cigarOperator == CigarOperator.D || cigarOperator == CigarOperator.S;
+    }
+
+    private final static boolean mightSupportInsertion(final CigarElement cigarElement) {
+        final CigarOperator cigarOperator = cigarElement.getOperator();
+        return cigarOperator == CigarOperator.I || cigarOperator == CigarOperator.S;
+    }
+}