From 0be44f286541c0290e9ad088d8241ca4826e3309 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 3 Jun 2024 14:32:35 -0400 Subject: [PATCH] Mutect2 germline resource can have split multiallelic format (#8837) --- .../mutect/SomaticGenotypingEngine.java | 40 +++++++++++-------- .../SomaticGenotypingEngineUnitTest.java | 35 +++++++++++++++- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java index 5324df8f59a..f5f56a2483e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java @@ -404,9 +404,7 @@ private Optional getForNormal(final Supplier supplier) { private static Map getNegativeLogPopulationAFAnnotation(List germlineResourceVariants, final List allAlleles, final double afOfAllelesNotInGermlineResource) { - final Optional germlineVC = germlineResourceVariants.isEmpty() ? Optional.empty() - : Optional.of(germlineResourceVariants.get(0)); // assume only one VC per site - final double[] populationAlleleFrequencies = getGermlineAltAlleleFrequencies(allAlleles, germlineVC, afOfAllelesNotInGermlineResource); + final double[] populationAlleleFrequencies = getGermlineAltAlleleFrequencies(allAlleles, germlineResourceVariants, afOfAllelesNotInGermlineResource); return ImmutableMap.of(GATKVCFConstants.POPULATION_AF_KEY, MathUtils.applyToArray(populationAlleleFrequencies, x -> - Math.log10(x))); } @@ -416,27 +414,35 @@ private static Map getNegativeLogPopulationAFAnnotation(List allAlleles, final Optional germlineVC, final double afOfAllelesNotInGermlineResource) { + static double[] getGermlineAltAlleleFrequencies(final List allAlleles, final List germlineVCs, final double afOfAllelesNotInGermlineResource) { Utils.validateArg(!allAlleles.isEmpty(), "allAlleles are empty -- there is not even a reference allele."); - if (germlineVC.isPresent()) { - if (! germlineVC.get().hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) { - logger.warn("Germline resource variant at " + germlineVC.get().getContig() + ":" + germlineVC.get().getStart() +" missing AF attribute"); - return Doubles.toArray(Collections.nCopies(allAlleles.size() - 1, afOfAllelesNotInGermlineResource)); + final Map alleleFrequencies = new HashMap<>(); + allAlleles.forEach(a -> alleleFrequencies.put(a, afOfAllelesNotInGermlineResource)); // initialize everything to the default + + // look through every germline resource variant context at this locus and fill in the AFs + for (final VariantContext germlineVC : germlineVCs) { + if (! germlineVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) { + logger.warn("Germline resource variant at " + germlineVC.getContig() + ":" + germlineVC.getStart() +" missing AF attribute"); + } + + List germlineIndices = GATKVariantContextUtils.alleleIndices(allAlleles, germlineVC.getAlleles()); + final List germlineAltAFs = Mutect2Engine.getAttributeAsDoubleList(germlineVC, VCFConstants.ALLELE_FREQUENCY_KEY, afOfAllelesNotInGermlineResource); + + if (germlineAltAFs.size() == (germlineVC.getNAlleles() - 1)) { // skip VCs with a bad AF field that got parsed as a wrong-length list + for (int alleleIndex = 1; alleleIndex < allAlleles.size(); alleleIndex++) { // start at 1 to skip the reference, which doesn't have an AF annotation + final Allele allele = allAlleles.get(alleleIndex); + // note the -1 since germlineAltAFs do not include ref + germlineIndices.get(alleleIndex).ifPresent(germlineIndex -> alleleFrequencies.put(allele, germlineAltAFs.get(germlineIndex - 1))); + } } - List germlineIndices = GATKVariantContextUtils.alleleIndices(allAlleles, germlineVC.get().getAlleles()); - final List germlineAltAFs = Mutect2Engine.getAttributeAsDoubleList(germlineVC.get(), VCFConstants.ALLELE_FREQUENCY_KEY, afOfAllelesNotInGermlineResource); - - return germlineIndices.stream().skip(1) // skip the reference allele - .mapToDouble(idx -> idx.isPresent() ? germlineAltAFs.get(idx.getAsInt() - 1) : afOfAllelesNotInGermlineResource) // note the -1 since germlineAltAFs do not include ref - .toArray(); - } else { - return Doubles.toArray(Collections.nCopies(allAlleles.size() - 1, afOfAllelesNotInGermlineResource)); } + + return allAlleles.stream().skip(1).mapToDouble(alleleFrequencies::get).toArray(); // skip the reference allele } /** diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngineUnitTest.java index 9a6dd2b54eb..c99fe8be850 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngineUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngineUnitTest.java @@ -10,7 +10,9 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import javax.ws.rs.core.Variant; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Optional; @@ -53,8 +55,23 @@ public void testGetGermlineAltAlleleFrequencies(final List calledAlleles final int stop = start + length - 1; final VariantContext vc1 = new VariantContextBuilder("SOURCE", "1", start, stop, germlineAlleles) .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, germlineAltAFs).make(); - final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(calledAlleles, Optional.of(vc1), DEFAULT_AF); + final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(calledAlleles, List.of(vc1), DEFAULT_AF); Assert.assertEquals(result, expected, 1.0e-10); + + // multiallelic -- test splitting into multiple VCs + if (germlineAlleles.size() > 2) { + final Allele ref = germlineAlleles.get(0); + final List germlineVCs = new ArrayList<>(); + for (int n = 1; n < germlineAlleles.size(); n++) { + final Allele alt = germlineAlleles.get(n); + final VariantContext splitVC = new VariantContextBuilder("SOURCE", "1", start, stop, List.of(ref, alt)) + .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, new double[]{germlineAltAFs[n-1]}).make(); + germlineVCs.add(splitVC); + } + + final double[] splitResult = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(calledAlleles, germlineVCs, DEFAULT_AF); + Assert.assertEquals(splitResult, expected, 1.0e-10); + } } @DataProvider(name = "missingAFData") @@ -69,7 +86,21 @@ Object[][] missingAFData() { @Test(dataProvider = "missingAFData") public void testGetGermlineAltAlleleFrequenciesWithMissingAF(final VariantContext vc) { - final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(vc.getAlleles(), Optional.of(vc), DEFAULT_AF); + final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(vc.getAlleles(), List.of(vc), DEFAULT_AF); Assert.assertEquals(result, new double[] {DEFAULT_AF}, 1.0e-10); } + + // test getting alt allele frequencies when each alt has its own VCF line and its own VariantContext + @Test + public void testGetGermlineAltAlleleFrequenciesFromSplitAllelesFormat() { + final double af1 = 0.1; + final double af2 = 0.2; + final VariantContext vc1 = new VariantContextBuilder("SOURCE", "1", 1, 1, Arrays.asList(Allele.REF_A, Allele.ALT_C)) + .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, af1).make(); + final VariantContext vc2 = new VariantContextBuilder("SOURCE", "1", 1, 1, Arrays.asList(Allele.REF_A, Allele.ALT_T)) + .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, af2).make(); + final List alleles = List.of(Allele.REF_A, Allele.ALT_C, Allele.ALT_T); + final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(alleles, List.of(vc1, vc2), DEFAULT_AF); + Assert.assertEquals(result, new double[] {af1, af2}, 1.0e-10); + } } \ No newline at end of file