Skip to content

Commit

Permalink
Add haploid support to GnarlyGenotyper (#7750)
Browse files Browse the repository at this point in the history
  • Loading branch information
ldgauthier authored Jul 17, 2023
1 parent 248dd79 commit 7d900f5
Show file tree
Hide file tree
Showing 14 changed files with 70,749 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@
@BetaFeature
public final class GnarlyGenotyper extends VariantWalker {

public static final int PIPELINE_MAX_ALT_COUNT = GenotypeCalculationArgumentCollection.DEFAULT_MAX_ALTERNATE_ALLELES;

private static final OneShotLogger warning = new OneShotLogger(GnarlyGenotyper.class);

private static final boolean CALL_GENOTYPES = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypesCache;
import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.utils.GenotypeCounts;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
Expand Down Expand Up @@ -41,6 +39,7 @@ public final class GnarlyGenotyperEngine {

// cache the ploidy 2 PL array sizes for increasing numbers of alts up to the maximum of maxAltAllelesToOutput
private int[] likelihoodSizeCache;
private final ArrayList<GenotypeLikelihoodCalculator> glcCache = new ArrayList<>();
private Set<Class<? extends InfoFieldAnnotation>> allASAnnotations;

private final int maxAltAllelesToOutput;
Expand All @@ -58,12 +57,6 @@ public GnarlyGenotyperEngine(final boolean keepAllSites, final int maxAltAlleles
this.keepAllSites = keepAllSites;
this.stripASAnnotations = stripASAnnotations;

//initialize PL size cache -- HTSJDK cache only goes up to 4 alts, but I need 6
likelihoodSizeCache = new int[maxAltAllelesToOutput + 1 + 1]; //+1 for ref and +1 so index == numAlleles
for (final int numAlleles : IntStream.rangeClosed(1, maxAltAllelesToOutput + 1).boxed().collect(Collectors.toList())) {
likelihoodSizeCache[numAlleles] = GenotypeLikelihoods.numLikelihoods(numAlleles, ASSUMED_PLOIDY);
}

//TODO: fix weird reflection logging?
final Reflections reflections = new Reflections("org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific");
allASAnnotations = reflections.getSubTypesOf(InfoFieldAnnotation.class);
Expand Down Expand Up @@ -320,34 +313,32 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
int newPLsize = -1;
final int maximumAlleleCount = inputAllelesWithNonRef.size();
final int numConcreteAlts = maximumAlleleCount - 2; //-1 for NON_REF and -1 for ref
if (maximumAlleleCount <= maxAllelesToOutput) {
newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, ASSUMED_PLOIDY);
}

for ( final Genotype g : vc.getGenotypes() ) {
final String name = g.getSampleName();
if(g.getPloidy() != ASSUMED_PLOIDY && !isGDBnoCall(g)) {
throw new UserException.BadInput("This tool assumes diploid genotypes, but sample " + name + " has ploidy "
+ g.getPloidy() + " at position " + vc.getContig() + ":" + vc.getStart() + ".");
}
final Genotype calledGT;
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
genotypeBuilder.name(name);
if (isGDBnoCall(g) || g.getAllele(0).equals(Allele.NON_REF_ALLELE) || g.getAllele(1).equals(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY));
if (g.getAlleles().contains(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())).noGQ();
//there will be cases when we're running over Y or haploid X and we haven't seen any variants yet
// and we assume diploid when we shouldn't, but there's not a lot to be done about it
} else if (isGDBnoCall(g)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
}
else if (nonRefReturned) {
if (g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
genotypeBuilder.AD(AD);
}
else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
}
if (nonRefReturned && g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
genotypeBuilder.AD(AD);
}
if (g.hasPL()) {
//lookup PL size from cache if ploidy matches and cache has our number of alts
if (maximumAlleleCount <= maxAllelesToOutput && g.getPloidy() == ASSUMED_PLOIDY) {
newPLsize = GenotypeIndexCalculator.genotypeCount(g.getPloidy(), numConcreteAlts+1);
//newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
//otherwise calculate size on the fly
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, g.getPloidy());
}
final int[] PLs = trimPLs(g, newPLsize);
if (emitPLs) {
genotypeBuilder.PL(PLs);
Expand All @@ -357,7 +348,7 @@ else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
genotypeBuilder.GQ(MathUtils.secondSmallestMinusSmallest(PLs, 0));
//If GenomicsDB returns no-call genotypes like CombineGVCFs (depending on the GenomicsDBExportConfiguration),
// then we need to actually find the GT from PLs
makeGenotypeCall(genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles);
makeGenotypeCall(g, genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles);
}
final Map<String, Object> attrs = new HashMap<>(g.getExtendedAttributes());
attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY);
Expand All @@ -370,7 +361,7 @@ else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
}

//running total for AC values
for (int i = 0; i < ASSUMED_PLOIDY; i++) {
for (int i = 0; i < calledGT.getPloidy(); i++) {
final Allele a = calledGT.getAllele(i);
final int count = targetAlleleCounts.getOrDefault(a, 0);
if (!a.equals(Allele.NO_CALL)) {
Expand All @@ -391,37 +382,32 @@ else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
* For a genotype with likelihoods that has a no-call GT, determine the most likely genotype from PLs and set it
* We use a GenotypeLikelihoodCalculator to convert from the best PL index to the indexes of the alleles for that
* genotype so we can set the GenotypeBuilder with the alleles
* @param g Genotype used to make the gb GenotypeBuilder
* @param gb GenotypeBuilder to modify and pass back
* @param genotypeLikelihoods PLs to use to call genotype; count should agree with number of alleles in allelesToUse
* @param allelesToUse alleles in the parent VariantContext (with ref), because GenotypeBuilder needs the allele String rather than index
*/
@VisibleForTesting
protected void makeGenotypeCall(final GenotypeBuilder gb,
protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
final double[] genotypeLikelihoods,
final List<Allele> allelesToUse) {
final int maxAllelesToOutput = maxAltAllelesToOutput + 1; //+1 for ref

if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) {
gb.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
//gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ();
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL,
genotypeLikelihoods, allelesToUse, null);
} else {
final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
final GenotypeAlleleCounts alleleCounts = GenotypesCache.get(ASSUMED_PLOIDY, maxLikelihoodIndex);

gb.alleles(alleleCounts.asAlleleList(allelesToUse));
final int numAltAlleles = allelesToUse.size() - 1;
if ( numAltAlleles > 0 ) {
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(maxLikelihoodIndex, genotypeLikelihoods));
}
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
genotypeLikelihoods, allelesToUse, null);
}
}

/**
* Some legacy versions of GenomicsDB report no-calls as `0` or `.`
* @param g genotype to check
* @return true if this is a genotype that should be represented as a ploidy-aware, spec compliant no-call
* @return true if this is a genotype that should be represented spec compliant no-call (may need to fix ploidy)
*/
private static boolean isGDBnoCall(final Genotype g) {
return g.getPloidy() == 1 && (g.getAllele(0).isReference() || g.getAllele(0).isNoCall());
return !g.hasPL() && !g.hasAD() && g.isNoCall();
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
Expand All @@ -8,7 +10,6 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
Expand All @@ -22,7 +23,6 @@

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -53,13 +53,19 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
@DataProvider(name="VCFdata")
public Object[][] getVCFdata() {
return new Object[][]{
/*
//chrX haploid sample plus diploid sample -- expected results validated with vcf-validator (samtools?)
{new File[]{getTestFile("NA12891.chrX.haploid.rb.g.vcf"), getTestFile("NA12892.chrX.diploid.rb.g.vcf")},
getTestFile("haploidPlusDiploid.expected.vcf"), null, Arrays.asList(new SimpleInterval("chrX", 1000000, 5000000)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
//8 ALT alleles -- no PLs
{new File[]{getTestFile("sample6.vcf"), getTestFile("sample7.vcf"), getTestFile("sample8.vcf"), getTestFile("sample9.vcf")},
getTestFile("lotsOfAltsNoPLs.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 257008, 257008)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
//6 ALT alleles -- yes PLs
{new File[]{getTestFile("sample6.vcf"), getTestFile("sample7.vcf"), getTestFile("sample8.vcf")},
getTestFile("lotsOfAltsYesPLs.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 257008, 257008)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
// Simple Test, spanning deletions; standard calling confidence
*/
// Simple Test, spanning deletions; standard calling confidence
//No variants outside requested intervals; no SNPs with QUAL < 60, no INDELs with QUAL < 69?; has star alleles after deletion at chr20:263497; has AC, AF, AN, DP, ExcessHet, FS, MQ, (MQRankSum), (ReadPosRankSum), SOR, QD; has called genotypes
{new File[]{getTestFile("sample1.vcf"), getTestFile("sample2.vcf"), getTestFile("sample3.vcf"), getTestFile("sample4.vcf"), getTestFile("sample5.vcf")},
getTestFile("fiveSampleTest.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 251370, 252000), new SimpleInterval("chr20", 263000, 265600)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
Expand Down Expand Up @@ -217,4 +223,26 @@ public void testOnEmptyAnnotations() {
Assert.assertEquals(sors.get(1), VCFConstants.MISSING_VALUE_v4);
}

@Test
public void testHaploidInput() {
final File haploidGVCF = new File(getToolTestDataDir(), "chrY_haploid_dragen.g.vcf");
final File output = createTempFile("GnarlyGenotyper", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(hg38Reference))
.add("V", haploidGVCF)
.addOutput(output)
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 1);
final Genotype g = vc.getGenotype(0);
Assert.assertEquals(g.getPloidy(), 1);
Assert.assertEquals(g.getAlleles().get(0), Allele.ALT_A);
Assert.assertTrue(g.hasPL());
Assert.assertEquals(g.getPL().length, 2);
Assert.assertEquals(g.getPL(), new int[]{83,0});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,19 @@ public void testLotsOfAlts() {
//use more alts than the maxAltAllelesToOutput for the engine, forcing on-the-fly generation of GLCalculator not in the cache
@Test
public void testGenotypeCallForLotsOfAlts() {
final GnarlyGenotyperEngine engine = new GnarlyGenotyperEngine(false, 4, false, true);
final GnarlyGenotyperEngine engine = new GnarlyGenotyperEngine(false, 4, true);

final Genotype g1 = VariantContextTestUtils.makeG("g1", oneInserted, twoInserted, sample1pls);
final Genotype g2 = VariantContextTestUtils.makeG("g1", Aref, oneInserted, sample2pls);
final VariantContext vc = VariantContextTestUtils.makeVC("test", Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats), g1, g2);

final GenotypeBuilder builder1 = new GenotypeBuilder(g1);
engine.makeGenotypeCall(builder1, GenotypeLikelihoods.fromPLs(sample1pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
engine.makeGenotypeCall(g1, builder1, GenotypeLikelihoods.fromPLs(sample1pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
final List<Allele> calledAlleles1 = builder1.make().getAlleles();
Assert.assertTrue(calledAlleles1.size() == 2 && calledAlleles1.contains(oneInserted) && calledAlleles1.contains(twoInserted));

final GenotypeBuilder builder2 = new GenotypeBuilder(g2);
engine.makeGenotypeCall(builder2, GenotypeLikelihoods.fromPLs(sample2pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
engine.makeGenotypeCall(g2, builder2, GenotypeLikelihoods.fromPLs(sample2pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
final List<Allele> calledAlleles2 = builder2.make().getAlleles();
Assert.assertTrue(calledAlleles2.size() == 2 && calledAlleles2.contains(Aref) && calledAlleles2.contains(oneInserted));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ public void testMultipleInputs() {
}

@Test(expectedExceptions = UserException.class)
//ReblockGVCF can take multiple inputs, but only if they're non-overlapping shards from the same sample
public void testMixedSamples() {
final File output = createTempFile("reblockedgvcf", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
Expand Down
Loading

0 comments on commit 7d900f5

Please sign in to comment.