Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GenotypeGVCFs with mixed ploidy sites #8862

Merged
merged 8 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -395,16 +395,20 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
*/
protected final boolean cannotBeGenotyped(final VariantContext vc) {
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
&& vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) { //likelihoods may be missing when reading from GenomicsDB if there are more alts that GDB args allow
// likelihoods may be missing when reading from GenomicsDB if there are more alts that GDB args allow
// so ensure all genotypes (outside of 0/0 and ./.) have likelihoods
&& vc.getGenotypes().stream().filter( g -> !(g.isNoCall() || g.isHomRef()) ).allMatch(Genotype::hasLikelihoods)
// if all sites are no calls or hom refs then keep a site where any samples have likelihoods
&& vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) {
return false;
}
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
return true;
}else {
logger.warn("No genotype contained sufficient data to recalculate site and allele qualities. Site will be skipped at location "
} else {
logger.warn("Some genotypes contained insufficient data to recalculate site and allele qualities. Site will be skipped at location "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch!

+ vc.getContig() + ":" + vc.getStart());
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1096,4 +1096,43 @@ public void testNoReadsReblockedOutputAsNoCall() {
Assert.assertFalse(g.hasAD());
Assert.assertFalse(g.hasPL());
}

@Test
public void testMixHaploidDiploidHighAltSite() {
final String inputVcfsDir = toolsTestDir + "/walkers/GenotypeGVCFs/mixHaploidDiploidHighAlt/";
List<File> inputs = new ArrayList<>();

// list of 3 genotype 1/2 samples and one genotype 1 sample with 6 alt alleles
inputs.add(new File(inputVcfsDir + "haploid.rb.g.vcf"));
for (int i = 1; i <= 3; i++) {
String str = String.format("%ss%02d.rb.g.vcf", inputVcfsDir, i);
inputs.add(new File(str));
}
final SimpleInterval interval = new SimpleInterval("chrX", 66780645, 66780646);
final File tempGenomicsDB2 = GenomicsDBTestUtils.createTempGenomicsDB(inputs, interval);
final String genomicsDBUri2 = GenomicsDBTestUtils.makeGenomicsDBUri(tempGenomicsDB2);
final File output = createTempFile("mixHaploidDiploidHighAlt", ".vcf");

final ArgumentsBuilder argsGenotypeGVCFs = new ArgumentsBuilder();
argsGenotypeGVCFs.addReference(hg38Reference)
.addVCF(genomicsDBUri2)
.addInterval(interval)
// 6 alt alleles is 6C2 = 15 genotypes for diploids, but only 6 genotypes for haploids
.add("max-genotype-count", 8)
.addOutput(output);
//Make sure we don't hit an exception
runCommandLine(argsGenotypeGVCFs);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
//Make sure the first site was successfully removed
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
//Make sure the second site was successfully genotyped (3 no calls and 1 haploid alt genotype)
Assert.assertEquals(vc.getStart(), 66780646);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm feeling a little paranoid after so many GenotypeGVCFs hiccups. If you've got the patience to wait for tests again, can you check that the remaining VC has genotypes called as expected with PLs? We do expect PLs right?

Assert.assertEquals(vc.getGenotypes().size(), 4);
List<Genotype> calledGenotypes = vc.getGenotypes().stream().filter(g -> !g.isNoCall()).toList();
Assert.assertEquals(calledGenotypes.size(), 1);
Assert.assertEquals(calledGenotypes.get(0).getAlleles().size(), 1);
Assert.assertEquals(calledGenotypes.get(0).getAlleles().get(0), Allele.create("TA", false));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths (counting only informative reads out of the total reads) for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions for alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele">
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="Phred-scaled posterior probabilities for genotypes as defined in the VCF specification">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=ICNT,Number=2,Type=Integer,Description="Counts of INDEL informative reads based on the reference confidence model">
##FORMAT=<ID=MB,Number=4,Type=Integer,Description="Per-sample component statistics to detect mate bias">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PRI,Number=G,Type=Float,Description="Phred-scaled prior probabilities for genotypes">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias">
##FORMAT=<ID=SPL,Number=.,Type=Integer,Description="Normalized, Phred-scaled likelihoods for SNPs based on the reference confidence model">
##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Somatic quality">
##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-100=minGQ=40(inclusive),maxGQ=100(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AS_BaseQRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt Vs. Ref base qualities">
##INFO=<ID=AS_FS,Number=A,Type=Float,Description="allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele">
##INFO=<ID=AS_InbreedingCoeff,Number=A,Type=Float,Description="Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=AS_MQ,Number=A,Type=Float,Description="Allele-specific RMS Mapping Quality">
##INFO=<ID=AS_MQRankSum,Number=A,Type=Float,Description="Allele-specific Mapping Quality Rank Sum">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
##INFO=<ID=AS_ReadPosRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias">
##INFO=<ID=AS_SOR,Number=A,Type=Float,Description="Allele specific strand Odds Ratio of 2x|Alts| contingency table to detect allele specific strand bias">
##INFO=<ID=AS_VarDP,Number=1,Type=String,Description="Allele-specific (informative) depth over variant genotypes -- including ref, RAW format">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (informative and non-informative); some reads may have been filtered based on mapq etc.">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=FractionInformativeReads,Number=1,Type=Float,Description="The fraction of informative reads out of the total reads">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=LOD,Number=1,Type=Float,Description="Variant LOD score">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=MQ_DP,Number=1,Type=Integer,Description="Depth over variant samples for better MQ calculation (deprecated -- use RAW_MQandDP instead.)">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
##INFO=<ID=R2_5P_bias,Number=1,Type=Float,Description="Score based on mate bias and distance from 5 prime end">
##INFO=<ID=RAW_GT_COUNT,Number=3,Type=Integer,Description="Counts of genotypes w.r.t. the reference allele in the following order: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=VarDP,Number=1,Type=Integer,Description="(informative) depth over variant genotypes">
##contig=<ID=chrX,length=156040895>
##reference=file:///seq/dragen/references/GRCh38dh/v3.7.8/reference.bin
##source=ReblockGVCF
##bcftools_viewVersion=1.19+htslib-1.19
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT haploid
chrX 66780645 . C CT,<NON_REF> 65.55 . AS_QUALapprox=|71|0;AS_VarDP=0|4|0;DP=9;MQ=216.86;QUALapprox=71;RAW_GT_COUNT=0,0,1;RAW_MQandDP=423254,9;VarDP=4 GT:AD:AF:DP:F1R2:F2R1:GQ:ICNT:MB:PL:SB:SPL 1:0,4,0:1,0:4:0,1,0:0,3,0:71:0,4:0,0,2,2:71,0,89:0,0,1,3:0,110
chrX 66780646 . T TA,<NON_REF> 44.8 . AS_QUALapprox=|50|0;AS_VarDP=13|8|0;DP=30;MQ=218.46;QUALapprox=50;RAW_GT_COUNT=0,1,0;RAW_MQandDP=1431743,30;VarDP=21 GT:AD:AF:DP:F1R2:F2R1:GQ:ICNT:MB:PL:SB:SPL 1:0,4,0:1,0:4:0,1,0:0,3,0:71:0,4:0,0,2,2:71,0,89:0,0,1,3:0,110
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths (counting only informative reads out of the total reads) for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions for alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele">
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="Phred-scaled posterior probabilities for genotypes as defined in the VCF specification">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=ICNT,Number=2,Type=Integer,Description="Counts of INDEL informative reads based on the reference confidence model">
##FORMAT=<ID=MB,Number=4,Type=Integer,Description="Per-sample component statistics to detect mate bias">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PRI,Number=G,Type=Float,Description="Phred-scaled prior probabilities for genotypes">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias">
##FORMAT=<ID=SPL,Number=.,Type=Integer,Description="Normalized, Phred-scaled likelihoods for SNPs based on the reference confidence model">
##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Somatic quality">
##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-100=minGQ=40(inclusive),maxGQ=100(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AS_BaseQRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt Vs. Ref base qualities">
##INFO=<ID=AS_FS,Number=A,Type=Float,Description="allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele">
##INFO=<ID=AS_InbreedingCoeff,Number=A,Type=Float,Description="Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=AS_MQ,Number=A,Type=Float,Description="Allele-specific RMS Mapping Quality">
##INFO=<ID=AS_MQRankSum,Number=A,Type=Float,Description="Allele-specific Mapping Quality Rank Sum">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
##INFO=<ID=AS_ReadPosRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias">
##INFO=<ID=AS_SOR,Number=A,Type=Float,Description="Allele specific strand Odds Ratio of 2x|Alts| contingency table to detect allele specific strand bias">
##INFO=<ID=AS_VarDP,Number=1,Type=String,Description="Allele-specific (informative) depth over variant genotypes -- including ref, RAW format">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (informative and non-informative); some reads may have been filtered based on mapq etc.">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=FractionInformativeReads,Number=1,Type=Float,Description="The fraction of informative reads out of the total reads">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=LOD,Number=1,Type=Float,Description="Variant LOD score">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=MQ_DP,Number=1,Type=Integer,Description="Depth over variant samples for better MQ calculation (deprecated -- use RAW_MQandDP instead.)">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
##INFO=<ID=R2_5P_bias,Number=1,Type=Float,Description="Score based on mate bias and distance from 5 prime end">
##INFO=<ID=RAW_GT_COUNT,Number=3,Type=Integer,Description="Counts of genotypes w.r.t. the reference allele in the following order: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=VarDP,Number=1,Type=Integer,Description="(informative) depth over variant genotypes">
##contig=<ID=chrX,length=156040895>
##reference=file:///seq/dragen/references/GRCh38dh/v3.7.8/reference.bin
##source=ReblockGVCF
##bcftools_viewVersion=1.19+htslib-1.19
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s01
chrX 66780645 . C CT,CTT,<NON_REF> 44.8 . AS_QUALapprox=|50|0;AS_VarDP=13|8|0;DP=30;MQ=218.46;QUALapprox=50;RAW_GT_COUNT=0,1,0;RAW_MQandDP=1431743,30;VarDP=21 GT:AD:AF:DP:F1R2:F2R1:GQ:ICNT:MB:PL:SB:SPL 1/2:0,6,8,0:0.429,0.571,0:14:0,2,3,0:0,4,5,0:55:0,13:0,0,10,4:317,325,55,262,0,55,227,131,60,172:0,0,3,11:255,0,173
chrX 66780646 . T TA,TAA,<NON_REF> 44.8 . AS_QUALapprox=|50|0;AS_VarDP=13|8|0;DP=30;MQ=218.46;QUALapprox=50;RAW_GT_COUNT=0,1,0;RAW_MQandDP=1431743,30;VarDP=21 GT:PL ./.:0,0,0,0,0,0
Binary file not shown.
Loading
Loading