Skip to content

Commit

Permalink
Mutect2 and HaplotypeCaller can emit MNPs according to adjustable dis…
Browse files Browse the repository at this point in the history
…tance threshold (#4650)
  • Loading branch information
davidbenjamin authored May 22, 2018
1 parent 06a5050 commit e078a57
Show file tree
Hide file tree
Showing 21 changed files with 432 additions and 37 deletions.
2 changes: 1 addition & 1 deletion scripts/m2_cromwell_tests/test_m2_wdl_multi.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"Mutect2_Multi.gatk_docker": "__GATK_DOCKER__",
"Mutect2_Multi.oncotator_docker": "broadinstitute/oncotator:1.9.7.0",
"Mutect2_Multi.oncotator_docker": "broadinstitute/oncotator:1.9.9.0",
"Mutect2_Multi.intervals": "/home/travis/build/broadinstitute/gatk/scripts/m2_cromwell_tests/interval_list.interval_list",
"Mutect2_Multi.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta",
"Mutect2_Multi.ref_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai",
Expand Down
2 changes: 1 addition & 1 deletion scripts/mutect2_wdl/mutect2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ workflow Mutect2 {
String gatk_docker
String basic_bash_docker = "ubuntu:16.04"
String? oncotator_docker
String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.8.0"])
String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"])
Boolean? filter_oncotator_maf
Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true])
String? oncotator_extra_args
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall
public static final String CAPTURE_ASSEMBLY_FAILURE_BAM_LONG_NAME = "capture-assembly-failure-bam";
public static final String ERROR_CORRECT_READS_LONG_NAME = "error-correct-reads";
public static final String DO_NOT_RUN_PHYSICAL_PHASING_LONG_NAME = "do-not-run-physical-phasing";

public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,18 +82,24 @@ public Set<Haplotype> getCalledHaplotypes() {
* @param ref the reference bases (over the same interval as the haplotypes)
* @param refLoc the span of the reference bases
* @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode)
* @param maxMnpDistance Phased substitutions separated by this distance or less are merged into MNPs. More than
* two substitutions occurring in the same alignment block (ie the same M/X/EQ CIGAR element)
* are merged until a substitution is separated from the previous one by a greater distance.
* That is, if maxMnpDistance = 1, substitutions at positions 10,11,12,14,15,17 are partitioned into a MNP
* at 10-12, a MNP at 14-15, and a SNP at 17.
* @return never {@code null} but perhaps an empty list if there is no variants to report.
*/
protected TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
final byte[] ref,
final SimpleInterval refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final List<VariantContext> activeAllelesToGenotype,
final int maxMnpDistance) {
final boolean inGGAMode = ! activeAllelesToGenotype.isEmpty();

// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
// IMPORTANT NOTE: This needs to be done even in GGA mode, as this method call has the side effect of setting the
// event maps in the Haplotype objects!
final TreeSet<Integer> startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.debug);
final TreeSet<Integer> startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.debug, maxMnpDistance);

if ( inGGAMode ) {
startPosKeySet.clear();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.broadinstitute.hellbender.utils.collections.CountSet;
import org.broadinstitute.hellbender.utils.haplotype.EventMap;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.param.ParamUtils;

import java.io.PrintWriter;
import java.io.StringWriter;
Expand Down Expand Up @@ -501,13 +502,18 @@ private void updateReferenceHaplotype(final Haplotype newHaplotype) {
*
* <p/>
* The result is sorted incrementally by location.
*
* @param maxMnpDistance Phased substitutions separated by this distance or less are merged into MNPs. More than
* two substitutions occuring in the same alignment block (ie the same M/X/EQ CIGAR element)
* are merged until a substitution is separated from the previous one by a greater distance.
* That is, if maxMnpDistance = 1, substitutions at 10,11,12,14,15,17 are partitioned into a MNP
* at 10-12, a MNP at 14-15, and a SNP at 17. May not be negative.
* @return never {@code null}, but perhaps an empty collection.
*/
public SortedSet<VariantContext> getVariationEvents() {
public SortedSet<VariantContext> getVariationEvents(final int maxMnpDistance) {
ParamUtils.isPositiveOrZero(maxMnpDistance, "maxMnpDistance may not be negative.");
if (variationEvents == null) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug);
EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);
}
return variationEvents;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import java.nio.file.Path;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand All @@ -15,10 +15,11 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.FileNotFoundException;
import java.nio.file.Path;
import java.util.List;
import org.broadinstitute.hellbender.utils.io.IOUtils;


/**
Expand Down Expand Up @@ -190,6 +191,9 @@ public AssemblyRegionEvaluator assemblyRegionEvaluator() {

@Override
public void onTraversalStart() {
if (hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF && hcArgs.maxMnpDistance > 0) {
throw new CommandLineException.BadArgumentValue("Non-zero maxMnpDistance is incompatible with GVCF mode.");
}
final ReferenceSequenceFile referenceReader = getReferenceReader(referenceArguments);
hcEngine = new HaplotypeCallerEngine(hcArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), referenceReader);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
private static final long serialVersionUID = 1L;

public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";

/**
* When HaplotypeCaller is run with -ERC GVCF or -ERC BP_RESOLUTION, some annotations are excluded from the
* output by default because they will only be meaningful once they have been recalculated by GenotypeGVCFs. As
Expand Down Expand Up @@ -140,4 +143,12 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume
@Hidden
@Argument(fullName = "dont-genotype", doc = "Perform assembly but do not genotype variants", optional = true)
public boolean dontGenotype = false;

/**
* Two or more phased substitutions separated by this distance or less are merged into MNPs.
*/
@Advanced
@Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true)
public int maxMnpDistance = 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner);

final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance);
// TODO - line bellow might be unnecessary : it might be that assemblyResult will always have those alleles anyway
// TODO - so check and remove if that is the case:
allVariationEvents.addAll(givenAlleles);
Expand Down Expand Up @@ -575,6 +575,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
features,
(hcArgs.assemblerArgs.consensusMode ? Collections.<VariantContext>emptyList() : givenAlleles),
emitReferenceConfidence(),
hcArgs.maxMnpDistance,
readsHeader);

if ( haplotypeBAMWriter.isPresent() ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
Expand Down Expand Up @@ -84,6 +85,11 @@ protected boolean forceKeepAllele(final Allele allele) {
* @param activeRegionWindow Active window
* @param activeAllelesToGenotype Alleles to genotype
* @param emitReferenceConfidence whether we should add a &lt;NON_REF&gt; alternative allele to the result variation contexts.
* @param maxMnpDistance Phased substitutions separated by this distance or less are merged into MNPs. More than
* two substitutions occuring in the same alignment block (ie the same M/X/EQ CIGAR element)
* are merged until a substitution is separated from the previous one by a greater distance.
* That is, if maxMnpDistance = 1, substitutions at 10,11,12,14,15,17 are partitioned into a MNP
* at 10-12, a MNP at 14-15, and a SNP at 17. May not be negative.
*
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
*
Expand All @@ -97,6 +103,7 @@ public CalledHaplotypes assignGenotypeLikelihoods(final List<Haplotype> haplotyp
final FeatureContext tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence,
final int maxMnpDistance,
final SAMFileHeader header) {
// sanity check input arguments
Utils.nonEmpty(haplotypes, "haplotypes input should be non-empty and non-null");
Expand All @@ -107,10 +114,11 @@ public CalledHaplotypes assignGenotypeLikelihoods(final List<Haplotype> haplotyp
Utils.nonNull(activeRegionWindow, "activeRegionWindow must be non-null");
Utils.nonNull(activeAllelesToGenotype, "activeAllelesToGenotype must be non-null");
Utils.validateArg(refLoc.contains(activeRegionWindow), "refLoc must contain activeRegionWindow");
ParamUtils.isPositiveOrZero(maxMnpDistance, "maxMnpDistance may not be negative.");

// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes
final SortedSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, ref, refLoc, activeAllelesToGenotype);
final SortedSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, ref, refLoc, activeAllelesToGenotype, maxMnpDistance);

// Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String DOWNSAMPLING_STRIDE_SHORT_NAME = "stride";
public static final String MAX_SUSPICIOUS_READS_PER_ALIGNMENT_START_LONG_NAME = "max-suspicious-reads-per-alignment-start";
public static final String NORMAL_LOD_LONG_NAME = "normal-lod";
public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";


public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8;
Expand Down Expand Up @@ -140,6 +142,14 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = NORMAL_LOD_LONG_NAME, optional = true, doc = "LOD threshold for calling normal variant non-germline.")
public double normalLodThreshold = 2.2;

/**
* Two or more phased substitutions separated by this distance or less are merged into MNPs.
*/
@Advanced
@Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true)
public int maxMnpDistance = 1;


/**
* Set of annotation arguments to use.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi

final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(MTAC.maxMnpDistance);
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents);
if (!trimmingResult.isVariationPresent()) {
return NO_CALLS;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public CalledHaplotypes callMutations(
// non-empty. At this point any given alleles have already been injected into the haplotypes, and passing
// givenAlleles to {@code decomposeHaplotypesIntoVariantContexts} actually overrides any non-given (discovery) alleles, which
// is not what we want.
final List<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), Collections.emptyList()).stream()
final List<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, assemblyResultSet.getFullReferenceWithPadding(), assemblyResultSet.getPaddedReferenceLoc(), Collections.emptyList(), MTAC.maxMnpDistance).stream()
.filter(loc -> activeRegionWindow.getStart() <= loc && loc <= activeRegionWindow.getEnd())
.collect(Collectors.toList());

Expand Down
Loading

0 comments on commit e078a57

Please sign in to comment.