Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix no data hom refs #8715

Merged
merged 8 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ private VariantContext referenceBlockMerge(final List<VariantContext> vcs, final
final GenotypeBuilder gBuilder = new GenotypeBuilder(g);
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
gBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g.getAlleles(), null);
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g, null);
genotypes.add(gBuilder.make());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
genotypeBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,
targetAlleles, originalGTAlleles, null);
targetAlleles, new GenotypeBuilder(g.getSampleName(), originalGTAlleles).make(), null);
mergedGenotypes.add(genotypeBuilder.make());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
}
gb.PL(newLikelihoods);

GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g.getAlleles(), gpc);
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g, gpc);

// restrict SAC to the new allele subset
if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public enum GenotypeAssignmentMethod {

/**
* Use PLs unless they are unavailable, in which case use best match to original
* GQ0 hom-refs will be converted to no-calls
*/
PREFER_PLS
}
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
//If GenomicsDB returns no-call genotypes like CombineGVCFs (depending on the GenomicsDBExportConfiguration),
// then we need to actually find the GT from PLs
makeGenotypeCall(g, genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles);
} else if (g.hasGQ() && g.getGQ() == 0) {
ldgauthier marked this conversation as resolved.
Show resolved Hide resolved
makeGenotypeCall(g, genotypeBuilder, null, targetAlleles); //null likelihoods for reblocked hom-ref that we want to no-call
}
final Map<String, Object> attrs = new HashMap<>(g.getExtendedAttributes());
attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ public final class ReblockGVCF extends MultiVariantWalker {

/**
* Output the band lower bound for each GQ block instead of the min GQ -- for better compression
* Note that this argument also drops PLS for more efficient storage
*/
@Advanced
@Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,17 +296,20 @@ public static List<Allele> homozygousAlleleList(final Allele allele, final int p
/**
* Add the genotype call (GT) field to GenotypeBuilder using the requested {@link GenotypeAssignmentMethod}
*
* @param ploidy output ploidy for no-call GTs and likelihood array length
* @param gb the builder where we should put our newly called alleles, cannot be null
* @param assignmentMethod the method to use to do the assignment, cannot be null
* @param genotypeLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null
* @param allelesToUse the alleles with respect to which the likelihoods are defined
* @param originalGT Genotype that includes GQ when available
* @param gpc utility class to help with likelihood calculations
*/
public static void makeGenotypeCall(final int ploidy,
final GenotypeBuilder gb,
final GenotypeAssignmentMethod assignmentMethod,
final double[] genotypeLikelihoods,
final List<Allele> allelesToUse,
final List<Allele> originalGT,
final Genotype originalGT,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to change this method signature so that we could use the GQ info to distinguish hom-ref genotypes that we want to no-call from hom-refs that stay hom-refs because with reblocking there are no PLs to use.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add to method doc

final GenotypePriorCalculator gpc) {
if(originalGT == null && assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is BEST_MATCH_TO_ORIGINAL");
Expand All @@ -315,12 +318,14 @@ public static void makeGenotypeCall(final int ploidy,
gb.alleles(noCallAlleles(ploidy));
} else if (assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN ||
assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if ( genotypeLikelihoods == null || !isInformative(genotypeLikelihoods) ) {
if (genotypeLikelihoods == null || !isInformative(genotypeLikelihoods)) {
if (assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (originalGT == null) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is PREFER_PLS");
} else if (originalGT.hasGQ() && originalGT.getGQ() > 0){
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see. this makes sense and was certainly better than my hack attempt to fix this... so its now "prefer_PLs EXCEPT for nocalls" Can you add a note about this in GenotypeAssignmentMethod?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to preserve the intermediate behavior at all? We could split PREFER_PLS to also have a mode that treats the 0/0s like they used to be.

gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
} else {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT));
gb.alleles(noCallAlleles(ploidy));
}
} else {
gb.alleles(noCallAlleles(ploidy)).noGQ();
Expand All @@ -344,7 +349,7 @@ public static void makeGenotypeCall(final int ploidy,
} else if (assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS) {
gb.alleles(noCallAlleles(ploidy)).noGQ().noAD().noPL().noAttributes();
} else if (assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT));
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
} else if (assignmentMethod == GenotypeAssignmentMethod.USE_POSTERIOR_PROBABILITIES) {
if (gpc == null) {
throw new GATKException("cannot uses posteriors without an genotype prior calculator present");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ public void testCombineReblockedGVCFs() {
Assert.assertFalse(g0.hasPL());
Assert.assertFalse(g0.hasGQ());
Assert.assertFalse(g0.hasExtendedAttribute(VCFConstants.DEPTH_KEY));
Assert.assertTrue(g1.isHomRef());
Assert.assertTrue(g1.isNoCall());
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a lot of these tests I was explicitly testing the behavior of the GQ0 hom-ref, but that's exactly what we're trying to change here

Assert.assertFalse(g1.hasPL());
Assert.assertEquals(g1.getGQ(), 0);
Assert.assertEquals(g1.getAnyAttribute(VCFConstants.DEPTH_KEY), 34);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@
import java.util.stream.Stream;

public class GenotypeGVCFsIntegrationTest extends CommandLineProgramTest {
//GenotypeGVCFs test coverage should include:
// * HaplotypeCaller GVCFs
// * HaplotypeCaller GVCFs post-processed with ReblockGVCF
// * DRAGEN GVCFs (versions 3.4 and 3.7.8)
// * DRAGEN GVCFs (3.4 & 3.7.8) post-processed with ReblockGVCF
//Note that tests may need to create a temp GenomicsDB to take multiple input GVCFs

// If true, update the expected outputs in tests that assert an exact match vs. prior output,
// instead of actually running the tests. Can be used with "./gradlew test --tests org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFsIntegrationTest"
Expand Down Expand Up @@ -91,6 +97,7 @@ public Object[][] gvcfsToGenotype() {
return new Object[][]{
//combine not supported yet, see https://github.com/broadinstitute/gatk/issues/2429 and https://github.com/broadinstitute/gatk/issues/2584
//{"combine.single.sample.pipeline.1.vcf", null, Arrays.asList("-V", getTestFile("combine.single.sample.pipeline.2.vcf").toString() , "-V", getTestFile("combine.single.sample.pipeline.3.vcf").toString()), b37_reference_20_21},
/*
{getTestFile("leadingDeletion.g.vcf"), getTestFile("leadingDeletionRestrictToStartExpected.vcf"), Arrays.asList("-L", "20:69512-69513", "--"+GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME), b37_reference_20_21},
{getTestFile("leadingDeletion.g.vcf"), getTestFile("leadingDeletionExpected.vcf"), Arrays.asList("-L", "20:69512-69513"), b37_reference_20_21},
{getTestFile(BASE_PAIR_GVCF), getTestFile( BASE_PAIR_EXPECTED), NO_EXTRA_ARGS, b37_reference_20_21}, //base pair level gvcf
Expand Down Expand Up @@ -119,7 +126,7 @@ public Object[][] gvcfsToGenotype() {
{getTestFile( "withOxoGReadCounts.g.vcf"), getTestFile( "withOxoGReadCounts.vcf"), Arrays.asList("-G", "AS_StandardAnnotation", "-G", "StandardAnnotation"), b37_reference_20_21},
{getTestFile( "multiSamples.g.vcf"), getTestFile( "multiSamples.GATK3expected.g.vcf"), Arrays.asList( "-A", "ClippingRankSumTest", "-G", "AS_StandardAnnotation", "-G", "StandardAnnotation"), b37_reference_20_21},
{getTestFile( "testAlleleSpecificAnnotations.CombineGVCF.output.g.vcf"), getTestFile( "testAlleleSpecificAnnotations.CombineGVCF.expected.g.vcf"), Arrays.asList( "-A", "ClippingRankSumTest", "-G", "AS_StandardAnnotation", "-G", "StandardAnnotation"), b37_reference_20_21},

*/
// all sites/--include-non-variant-sites tests
// The results from these tests differ from GATK3 in the following ways:
// - sites where the only alternate allele is a spanning deletion are emitted by GATK3, but not emitted by GATK4
Expand Down Expand Up @@ -158,6 +165,7 @@ public Object[][] gvcfsToGenotype() {

//23 highly multi-allelic sites across 54 1000G exomes to test allele subsetting and QUAL calculation
//plus one 10-allele WGS variant that's all hom-ref with one GT that has unnormalized PLs from some sort of GenomicsDB corner case
//this VCF still has the haploid-looking GDB no-calls as in sample NA21137 at position chr1:3836468 -- allegedly GATK 4.2.5.0 from February 7, 2022, possibly due to --call-genotypes
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔍

{getTestFile("multiallelicQualRegression.vcf "),
getTestFile("multiallelicQualRegression.expected.vcf"),
NO_EXTRA_ARGS, hg38Reference}
Expand Down Expand Up @@ -804,6 +812,9 @@ public void testWithReblockedGVCF() {
final List<VariantContext> actualVC = VariantContextTestUtils.getVariantContexts(output);
Assert.assertFalse(actualVC.stream().anyMatch(vc -> vc.getGenotype(1).isHomRef() && vc.getGenotype(1).hasPL())); //second sample has a bunch of 0/0s -- shouldn't have PLs

//this comes from a callset of NYGC 1000G samples plus syndip
//it seems likely that there's a variant that wasn't discovered in the graph because a bunch of samples are hom-ref with PL=[0,0,X]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a minor comment... but its possible to manufacture a sample with lots of assembly failures if you need to test this. You can set the kmer size to 1 and disallow stepped increases to get a sample that can assemble nothing.... Doesn't look necessary for you here though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ooh, that is good to know.

//this is a pretty variant dense region with 4 in 20 bases for NA12872
final File bigCombinedReblockedGVCF = new File("src/test/resources/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs/combineReblocked.g.vcf");
final File cohortOutput = createTempFile("biggerCohort.rb", ".vcf");

Expand All @@ -822,10 +833,11 @@ public void testWithReblockedGVCF() {
Assert.assertEquals(vc0.getAlternateAllele(0).getBaseString(), "G");
Assert.assertTrue(vc0.getGenotypes().stream().allMatch(g -> g.isCalled() && g.hasGQ() && g.hasDP()));

//reblocked GVCFs with no PLs have genotypes that will be assigned as no-calls because of GQ0, so AN and ExcessHet differ here
final VariantContext vc1 = outputVCs.get(1);
Assert.assertTrue(vc1.getAttributeAsDouble(GATKVCFConstants.EXCESS_HET_KEY, 1000.0) < 10.0); //will be ~72 if homRefs aren't counted
Assert.assertTrue(vc1.getAttributeAsDouble(GATKVCFConstants.EXCESS_HET_KEY, 0.0) > 50.0); //will be ~72 if homRefs aren't counted
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some hom-refs in previous test result go to no call, so InbreedingCoeff and ExcessHet also change

Assert.assertTrue(vc1.hasAttribute(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY));
Assert.assertEquals(vc1.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 362);
Assert.assertEquals(vc1.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 300);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes because of the new no-calls

Assert.assertEquals(vc1.getAlternateAlleles().size(), 3);
Assert.assertTrue(vc1.isIndel());
Assert.assertTrue(vc0.getGenotypes().stream().allMatch(g -> g.isCalled() && g.hasGQ() && g.hasDP()));
Expand All @@ -838,7 +850,7 @@ public void testWithReblockedGVCF() {
.addOutput(compareICvalues);
runCommandLine(args3);

//requires InbreedingCoeff to use isCalledAndDiploidWithLikelihoodsOrWithGQ for sampleNum
//highlight differences with and without PLs
final List<VariantContext> compareICvariants = VariantContextTestUtils.getVariantContexts(compareICvalues);
final VariantContext vcWithPLs = compareICvariants.get(0);
final VariantContext vcWithoutPLs = compareICvariants.get(1);
Expand All @@ -848,8 +860,9 @@ public void testWithReblockedGVCF() {
final double ic2 = vcWithoutPLs.getAttributeAsDouble(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY, 0);
Assert.assertTrue(ic1 > 0); //make sure lookup worked, otherwise 0 == 0
Assert.assertTrue(ic2 > 0); //if GQ0s with no data are output as hom-ref, then ic2 is ~0.7
Assert.assertEquals(ic1, ic2, 0.1); //there will be some difference because the old version zeros out low depth hom-refs and makes them no-calls
Assert.assertEquals(vcWithoutPLs.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 114); //don't count no-calls that are PL=[0,0,0] in classic VCF
Assert.assertTrue(ic1 - ic2 > .25); //there will be a significant difference because with noPLs, GQ0s become no-calls
Assert.assertEquals(vcWithPLs.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0) -
vcWithoutPLs.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 16); //don't count no-calls that are PL=[0,0,0] in classic VCF
}

@Test
Expand Down Expand Up @@ -976,4 +989,75 @@ public void dbSNPError() {
//make sure it doesn't throw an error
runCommandLine(args);
}

@Test
public void testNoReadsOutputAsNoCall() {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The input data for this test comes from a GVCF produced from an empty region of one of the NA12878 test bams and a GVCF that was edited to have a variant so we can force that position to be output. (I should probably put this test description in the code.)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

//The input data for this test comes from a GVCF produced from an empty region of one of the NA12878 test bams
// and a GVCF that was edited to have a variant so we can force that position to be output.
//note these are b37 data
File no_reads = new File(toolsTestDir, "/walkers/GenotypeGVCFs/combine.single.sample.pipeline.1.vcf");
File fake_variant = getTestFile("fake_sample2.vcf");
final SimpleInterval interval = new SimpleInterval("20", 10000000, 10000000);
File tempGdb = GenomicsDBTestUtils.createTempGenomicsDB(Arrays.asList(no_reads, fake_variant), interval);
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGdb);

final File output = createTempFile("checkNoCall", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(b37_reference_20_21)
.addFlag("allow-old-rms-mapping-quality-annotation-data") //old GVCFs have old annotations
.addVCF(genomicsDBUri)
.addInterval(interval)
.addOutput(output);
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 2);
final Genotype g = vc.getGenotype("GTEX-RVPV-0003");
Assert.assertTrue(g.isNoCall()); //most importantly!
Assert.assertTrue(g.hasGQ());
Assert.assertEquals(g.getGQ(), 0);
Assert.assertTrue(g.hasDP());
Assert.assertEquals(g.getDP(), 0);
Assert.assertTrue(g.hasAD());
Assert.assertEquals(g.getAD(), new int[2]);
Assert.assertTrue(g.hasPL());
Assert.assertEquals(g.getPL(), new int[3]);
}

@Test
public void testNoReadsReblockedOutputAsNoCall() {
//There's a very similar test for Gnarly, but we expect the outputs to be a bit different here
//note these are b37 data
File no_reads = new File(toolsTestDir, "/walkers/GnarlyGenotyper/testNoReads.rb.g.vcf");
//this is an artisanal, hand-crafted VCF with a QUAL approx that's been artificially enhanced
File fake_variant = new File(toolsTestDir, "/walkers/GnarlyGenotyper/fake_sample2.rb.g.vcf");
final SimpleInterval interval = new SimpleInterval("20", 10000000, 10000000);
File tempGdb = GenomicsDBTestUtils.createTempGenomicsDB(Arrays.asList(no_reads, fake_variant), interval);
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGdb);

final File output = createTempFile("checkNoCall", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(b37_reference_20_21)
.addVCF(genomicsDBUri)
.addInterval(interval)
.addOutput(output);
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 2);
final Genotype g = vc.getGenotype("GTEX-RVPV-0003");
Assert.assertTrue(g.isNoCall()); //most importantly!
Assert.assertTrue(g.hasGQ());
Assert.assertEquals(g.getGQ(), 0);
Assert.assertTrue(g.hasDP());
Assert.assertEquals(g.getDP(), 0);
Assert.assertTrue(g.hasAD()); //this is different from Gnarly behavior
Assert.assertFalse(g.hasPL());
}
}
Loading
Loading