Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New HaplotypeCaller priors for variants sites and homRef blocks #4793

Merged
merged 8 commits into from
May 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,7 @@ private static List<Allele> replaceWithNoCallsAndDels(final VariantContext vc) {
* @return never {@code null}
*/
//TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}.
@VisibleForTesting
static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele) {
public static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele) {
final Allele vcRef = vc.getReference();
final byte[] refBases = refAllele.getBases();
final int extraBaseCount = refBases.length - vcRef.getBases().length;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,16 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
}

final boolean useNewLikelihoods = newLikelihoods != null && (depth != 0 || GATKVariantContextUtils.isInformative(newLikelihoods));
final GenotypeBuilder gb = useNewLikelihoods ? new GenotypeBuilder(g).PL(newLikelihoods).log10PError(newLog10GQ) : new GenotypeBuilder(g).noPL().noGQ();

final GenotypeBuilder gb = new GenotypeBuilder(g);
if (useNewLikelihoods) {
final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
gb.PL(newLikelihoods).log10PError(newLog10GQ);
attributes.remove(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY);
gb.noAttributes().attributes(attributes);
}
else {
gb.noPL().noGQ();
}
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep);

// restrict SAC to the new allele subset
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.tools.walkers.genotyper;

import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.tools.walkers.variantutils.CalculateGenotypePosteriors;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;

Expand All @@ -13,6 +16,10 @@
public final class GenotypeCalculationArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

public static final String SUPPORTING_CALLSET_LONG_NAME = "population-callset";
public static final String SUPPORTING_CALLSET_SHORT_NAME = "population";
public static final String NUM_REF_SAMPLES_LONG_NAME = "num-reference-samples-if-no-call";

/**
* Creates a GenotypeCalculationArgumentCollection with default values.
*/
Expand All @@ -34,6 +41,8 @@ public GenotypeCalculationArgumentCollection( final GenotypeCalculationArgumentC
this.MAX_ALTERNATE_ALLELES = other.MAX_ALTERNATE_ALLELES;
this.inputPrior = new ArrayList<>(other.inputPrior);
this.samplePloidy = other.samplePloidy;
this.supportVariants = other.supportVariants;
this.numRefIfMissing = other.numRefIfMissing;
}

/**
Expand Down Expand Up @@ -90,7 +99,7 @@ public GenotypeCalculationArgumentCollection( final GenotypeCalculationArgumentC
* The standard deviation of the distribution of alt allele fractions. The above heterozygosity parameters give the
* *mean* of this distribution; this parameter gives its spread.
*/
@Argument(fullName = "heterozygosity-stdev", doc = "Standard deviation of eterozygosity for SNP and indel calling.", optional = true)
@Argument(fullName = "heterozygosity-stdev", doc = "Standard deviation of heterozygosity for SNP and indel calling.", optional = true)
public double heterozygosityStandardDeviation = 0.01;

/**
Expand Down Expand Up @@ -165,4 +174,20 @@ public GenotypeCalculationArgumentCollection( final GenotypeCalculationArgumentC
*/
@Argument(shortName="ploidy", fullName="sample-ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", optional=true)
public int samplePloidy = HomoSapiensConstants.DEFAULT_PLOIDY;

/**
* Supporting external panel. Allele counts from this panel (taken from AC,AN or MLEAC,AN or raw genotypes) will
* be used to inform the frequency distribution underlying the genotype priors. These files must be VCF 4.2 spec or later.
* Note that unlike CalculateGenotypePosteriors, HaplotypeCaller only allows one supporting callset.
*/
@Argument(fullName=SUPPORTING_CALLSET_LONG_NAME, shortName=SUPPORTING_CALLSET_SHORT_NAME, doc="Callset to use in calculating genotype priors", optional=true)
public FeatureInput<VariantContext> supportVariants = null;

/**
* When a variant is not seen in any panel, this argument controls whether to infer (and with what effective strength)
* that only reference alleles were observed at that site. E.g. "If not seen in 1000Genomes, treat it as AC=0,
* AN=2000".
*/
@Argument(fullName= NUM_REF_SAMPLES_LONG_NAME,doc="Number of hom-ref genotypes to infer at sites not present in a panel",optional=true)
public int numRefIfMissing = 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
Expand All @@ -18,6 +19,10 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;
import java.nio.file.Path;
import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5
genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, FixedAFCalculatorProvider.createThreadSafeProvider(hcArgs), ! hcArgs.doNotRunPhysicalPhasing);
genotypingEngine.setAnnotationEngine(annotationEngine);

referenceConfidenceModel = new ReferenceConfidenceModel(samplesList, readsHeader, hcArgs.indelSizeToEliminateInRefModel);
referenceConfidenceModel = new ReferenceConfidenceModel(samplesList, readsHeader, hcArgs.indelSizeToEliminateInRefModel, hcArgs.genotypeArgs.numRefIfMissing);

//Allele-specific annotations are not yet supported in the VCF mode
if (isAlleleSpecificMode(annotationEngine) && isVCFMode()){
Expand Down Expand Up @@ -373,6 +373,14 @@ public VCFHeader makeVCFHeader( final SAMSequenceDictionary sequenceDictionary,
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);

if (hcArgs.genotypeArgs.supportVariants != null) {
headerInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
headerInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
headerInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY));
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.GENOTYPE_PRIOR_KEY));
}

if ( ! hcArgs.doNotRunPhysicalPhasing ) {
headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY));
headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY));
Expand Down Expand Up @@ -481,13 +489,18 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
return NO_CALLS;
}

final List<VariantContext> VCpriors = new ArrayList<>();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can take it or leave it but personally I hate to separate a declaration from an initialization and prefer

final List<VariantContext> VCpriors = hcArgs.genotypeArgs.applyPriors ? features.getValues(hcArgs.genotypeArgs.supportVariants) : Collections.emptyList();

if (hcArgs.genotypeArgs.supportVariants != null) {
features.getValues(hcArgs.genotypeArgs.supportVariants).stream().forEach(VCpriors::add);
}

if ( hcArgs.sampleNameToUse != null ) {
removeReadsFromAllSamplesExcept(hcArgs.sampleNameToUse, region);
}

if( ! region.isActive() ) {
// Not active so nothing to do!
return referenceModelForNoVariation(region, true);
return referenceModelForNoVariation(region, true, VCpriors);
}

final List<VariantContext> givenAlleles = new ArrayList<>();
Expand All @@ -496,11 +509,11 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur

// No alleles found in this region so nothing to do!
if ( givenAlleles.isEmpty() ) {
return referenceModelForNoVariation(region, true);
return referenceModelForNoVariation(region, true, VCpriors);
}
} else if( region.size() == 0 ) {
// No reads here so nothing to do!
return referenceModelForNoVariation(region, true);
return referenceModelForNoVariation(region, true, VCpriors);
}

// run the local assembler, getting back a collection of information on how we should proceed
Expand All @@ -514,7 +527,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(region, allVariationEvents);

if ( ! trimmingResult.isVariationPresent() && ! hcArgs.disableOptimizations ) {
return referenceModelForNoVariation(region, false);
return referenceModelForNoVariation(region, false, VCpriors);
}

final AssemblyResultSet assemblyResult =
Expand All @@ -533,7 +546,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
// abort early if something is out of the acceptable range
// TODO is this ever true at this point??? perhaps GGA. Need to check.
if( ! assemblyResult.isVariationPresent() && ! hcArgs.disableOptimizations ) {
return referenceModelForNoVariation(region, false);
return referenceModelForNoVariation(region, false, VCpriors);
}

// For sure this is not true if gVCF is on.
Expand All @@ -544,7 +557,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
// TODO is this ever true at this point??? perhaps GGA. Need to check.
if ( regionForGenotyping.size() == 0 && ! hcArgs.disableOptimizations ) {
// no reads remain after filtering so nothing else to do!
return referenceModelForNoVariation(region, false);
return referenceModelForNoVariation(region, false, VCpriors);
}

// evaluate each sample's reads against all haplotypes
Expand Down Expand Up @@ -594,21 +607,22 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
if ( emitReferenceConfidence() ) {
if ( !containsCalls(calledHaplotypes) ) {
// no called all of the potential haplotypes
return referenceModelForNoVariation(region, false);
return referenceModelForNoVariation(region, false, VCpriors);
}
else {
final List<VariantContext> result = new LinkedList<>();
// output left-flanking non-variant section:
if (trimmingResult.hasLeftFlankingRegion()) {
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantLeftFlankRegion(), false));
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantLeftFlankRegion(), false, VCpriors));
}
// output variant containing region.
result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
readLikelihoods, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls()));
readLikelihoods, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.genotypeArgs.supportVariants != null,
VCpriors));
// output right-flanking non-variant section:
if (trimmingResult.hasRightFlankingRegion()) {
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(), false));
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(), false, VCpriors));
}
return result;
}
Expand Down Expand Up @@ -636,7 +650,7 @@ private boolean containsCalls(final HaplotypeCallerGenotypingEngine.CalledHaplot
* @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done)
* @return a list of variant contexts (can be empty) to emit for this ref region
*/
private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion region, final boolean needsToBeFinalized) {
private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion region, final boolean needsToBeFinalized, final List<VariantContext> VCpriors) {
if ( emitReferenceConfidence() ) {
//TODO - why the activeRegion cannot manage its own one-time finalization and filtering?
//TODO - perhaps we can remove the last parameter of this method and the three lines bellow?
Expand All @@ -650,7 +664,7 @@ private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion r
final List<Haplotype> haplotypes = Collections.singletonList(refHaplotype);
return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region),
genotypingEngine.getPloidyModel(), Collections.emptyList());
genotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.genotypeArgs.supportVariants != null, VCpriors);
}
else {
return NO_CALLS;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ public class HaplotypeCallerGenotypingEngine extends AssemblyBasedCallerGenotypi

private final PloidyModel ploidyModel;
private final ReferenceConfidenceMode referenceConfidenceMode;
protected final double snpHeterozygosity;
protected final double indelHeterozygosity;

private final int maxGenotypeCountToEnumerate;
private final Map<Integer, Integer> practicalAlleleCountForPloidy = new HashMap<>();
Expand All @@ -56,6 +58,8 @@ public HaplotypeCallerGenotypingEngine(final HaplotypeCallerArgumentCollection c
genotypingModel = new IndependentSampleGenotypesModel();
maxGenotypeCountToEnumerate = configuration.genotypeArgs.MAX_GENOTYPE_COUNT;
referenceConfidenceMode = configuration.emitReferenceConfidence;
snpHeterozygosity = configuration.genotypeArgs.snpHeterozygosity;
indelHeterozygosity = configuration.genotypeArgs.indelHeterozygosity;
}

@Override
Expand Down
Loading