Skip to content

Commit

Permalink
Fix and test for calculating CombineGVCFs intermediate band interval …
Browse files Browse the repository at this point in the history
…start locations. (broadinstitute#4681)
  • Loading branch information
cmnbroad authored and cwhelan committed May 25, 2018
1 parent 1404c8c commit 07890fe
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,15 @@ public void apply(List<VariantContext> variantContexts, ReferenceContext referen
*/
@VisibleForTesting
void createIntermediateVariants(SimpleInterval intervalToClose) {
Set<Integer> sitesToStop = new HashSet<>();
resizeReferenceIfNeeded(intervalToClose);

// Break up the GVCF according to the provided reference blocking scheme
if ( multipleAtWhichToBreakBands > 0) {
for (int i = ((intervalToClose.getStart())/multipleAtWhichToBreakBands)*multipleAtWhichToBreakBands; i <= intervalToClose.getEnd(); i+=multipleAtWhichToBreakBands) {
sitesToStop.add(i-1); // Subtract 1 here because we want to split before this base
}
}
// The values returned from getIntermediateStopSites represent a proposed set of stop sites that may include
// intervals that are outside the actual interval being closed. These sites are filtered out below.
// Note: Precomputing these is really inefficient when large reference blocks are closed with
// fine band resolution because it results in very large collections of stop sites (tens or hundreds of millions)
// that must subsequently be sorted.
final Set<Integer> sitesToStop = getIntermediateStopSites(intervalToClose, multipleAtWhichToBreakBands);

// If any variant contexts ended (or were spanning deletions) the last context compute where we should stop them
for (VariantContext vc : variantContextsOverlappingCurrentMerge) {
Expand All @@ -181,7 +181,7 @@ void createIntermediateVariants(SimpleInterval intervalToClose) {
List<Integer> stoppedLocs = new ArrayList<>(sitesToStop);
stoppedLocs.sort(Comparator.naturalOrder());

// For each stopped loc, create a fake QueuedContextState and pass it to endPreviousStats
// For each stopped loc that is within the interval being closed, create a fake QueuedContextState and pass it to endPreviousStats
for (int stoppedLoc : stoppedLocs) {
SimpleInterval loc = new SimpleInterval(intervalToClose.getContig(), stoppedLoc, stoppedLoc);
if (( stoppedLoc <= intervalToClose.getEnd() && stoppedLoc>= intervalToClose.getStart()) && isWithinInterval(loc)) {
Expand All @@ -192,6 +192,25 @@ void createIntermediateVariants(SimpleInterval intervalToClose) {

}

// Get any intermediate stop sites based on the break band multiple.
@VisibleForTesting
protected final static Set<Integer> getIntermediateStopSites(final SimpleInterval intervalToClose, final int breakBandMultiple) {
final Set<Integer> sitesToStop = new HashSet<>();

if ( breakBandMultiple > 0) {
// if the intermediate interval to close starts before the end of the first band multiple,
// create the first stop position at the end of the band multiple
for (int blockEndPosition = intervalToClose.getStart() < (breakBandMultiple + 1) ?
Math.max(2, breakBandMultiple) :
(intervalToClose.getStart() / breakBandMultiple) * breakBandMultiple;
blockEndPosition <= intervalToClose.getEnd();
blockEndPosition += breakBandMultiple) {
sitesToStop.add(blockEndPosition - 1); // Subtract 1 here because we want to split before this base
}
}
return sitesToStop;
}

/**
* Resize {@link #storedReferenceContext} to cover at least as much as intervalToClose
* @param intervalToClose
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ public Object[][] gvcfsToCombine() {
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},
//Test that trailing reference blocks are emitted with correct intervals
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")}, getTestFile("gvcfWithTrailingReferenceBlocksExpected.g.vcf"), NO_EXTRA_ARGS, b38_reference_20_21},
// same test as the previous one, except with a band multiple specified
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")},
getTestFile("gvcfWithTrailingReferenceBlocksBandedExpected.g.vcf"),
Arrays.asList("--" + CombineGVCFs.BREAK_BANDS_LONG_NAME, "2000000"),
b38_reference_20_21},
};
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package org.broadinstitute.hellbender.tools.walkers;

import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.*;
import java.util.stream.Collectors;

public class CombineGVCFsUnitTest {

@DataProvider(name="breakIntermediateStopSites")
public Object[][] getIntermediateStopSitesData() {
return new Object[][] {
// Note that the expected results here do not represent a final set of stop sites for the given
// interval. Rather, they are intended to match the output of the CombineGVCFs.getIntermediateStopSites
// method, which returns an initial set of intermediate stop sites that in some cases includes sites
// outside the actual interval being closed, but which are subsequently filtered out by additional
// downstream code in CombineGVCFs.
{ new SimpleInterval("contig", 1, 1), 1, Collections.EMPTY_LIST },
{ new SimpleInterval("contig", 1, 2), 1, Arrays.asList(1) },
{ new SimpleInterval("contig", 1, 2), 2, Arrays.asList(1) },
{ new SimpleInterval("contig", 1, 10), 2, Arrays.asList(1, 3, 5, 7, 9) },
{ new SimpleInterval("contig", 1, 10), 5, Arrays.asList(4, 9) },
{ new SimpleInterval("contig", 1, 100), 25, Arrays.asList(24, 49, 74, 99) },

{ new SimpleInterval("contig", 10, 10), 2, Arrays.asList(9) },
{ new SimpleInterval("contig", 10, 10), 5, Arrays.asList(9) },

{ new SimpleInterval("contig", 10, 10), 10, Arrays.asList(9) },
{ new SimpleInterval("contig", 10, 10), 100, Collections.EMPTY_LIST },
{ new SimpleInterval("contig", 10, 20), 5, Arrays.asList(9, 14, 19) },
{ new SimpleInterval("contig", 10, 20), 10, Arrays.asList(9, 19) },
{ new SimpleInterval("contig", 10, 20), 100, Collections.EMPTY_LIST },

{ new SimpleInterval("contig", 10, 100), 25, Arrays.asList(24, 49, 74, 99) },
{ new SimpleInterval("contig", 10, 100), 50, Arrays.asList(49, 99) },
{ new SimpleInterval("contig", 10, 100), 100, Arrays.asList(99) },
{ new SimpleInterval("contig", 10, 100), 1000, Collections.EMPTY_LIST },
{ new SimpleInterval("contig", 110, 120), 100, Arrays.asList(99) }
};
}

@Test(dataProvider = "breakIntermediateStopSites")
public void testGetIntermediateStopSites(
final SimpleInterval intervalToClose,
final int breakBandMultiple,
final List<Integer> expectedCloseSites)
{
final List<Integer> actualStopSites = new ArrayList<>(CombineGVCFs.getIntermediateStopSites(intervalToClose, breakBandMultiple));
actualStopSites.sort(Comparator.naturalOrder());
// validate that the resulting stop sites all result in valid single-position stop intervals
actualStopSites.stream().forEach(stopSite -> Assert.assertNotNull(new SimpleInterval(intervalToClose.getContig(), stopSite, stopSite)));
Assert.assertEquals(actualStopSites, expectedCloseSites);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineGVCFs,CommandLine="CombineGVCFs --output /var/folders/cr/16ghvyfj5lvfwxx01rt1k4tdl04sy3/T/combinegvcfs3475467159514601676.vcf --break-bands-at-multiples-of 2000000 --variant /Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs/gvcfExample1WithTrailingReferenceBlocks.g.vcf --variant /Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs/gvcfExample2WithTrailingReferenceBlocks.g.vcf --reference /Users/cnorman/projects/gatk/src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta --verbosity ERROR --annotation-group StandardAnnotation --disable-tool-default-annotations false --convert-to-base-pair-resolution false --ignore-variants-starting-outside-interval false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --help false --version false --showHidden false --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --disable-tool-default-read-filters false",Version=Unavailable,Date="April 19, 2018 9:53:59 AM EDT">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr20_GL383577v2_alt,length=128386>
##contig=<ID=chr20_KI270869v1_alt,length=118774>
##contig=<ID=chr20_KI270871v1_alt,length=58661>
##contig=<ID=chr20_KI270870v1_alt,length=183433>
##contig=<ID=chr21_GL383578v2_alt,length=63917>
##contig=<ID=chr21_KI270874v1_alt,length=166743>
##contig=<ID=chr21_KI270873v1_alt,length=143900>
##contig=<ID=chr21_GL383579v2_alt,length=201197>
##contig=<ID=chr21_GL383580v2_alt,length=74653>
##contig=<ID=chr21_GL383581v2_alt,length=116689>
##contig=<ID=chr21_KI270872v1_alt,length=82692>
##source=CombineGVCFs
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 NA2
chr20 69491 . G <NON_REF> . . END=69497 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.
chr20 69498 . C <NON_REF> . . END=69506 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.:94:99:82:99:0,120,1800
chr20 69507 . T <NON_REF> . . END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.
chr20 69511 . A G,<NON_REF> . . BaseQRankSum=1.17;DP=82;MQ=31.05;MQ0=0;MQRankSum=-8.660e-01;ReadPosRankSum=1.69 GT:AD:DP:GQ:PL:SB ./.:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 ./.
chr20 69512 . C <NON_REF> . . END=69513 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:96:99:82:99:0,120,1800 ./.
chr20 69514 . T <NON_REF> . . END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:96:99:82:99:0,120,1800 ./.:96:99:82:99:0,120,1800
chr20 69522 . T <NON_REF> . . END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:95:0:95:0:0,0,0 ./.:96:99:82:99:0,120,1800
chr20 69549 . G <NON_REF> . . END=69624 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:156:99:56:66:0,66,990 ./.:96:99:82:99:0,120,1800
chr20 69625 . A <NON_REF> . . END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:156:99:56:66:0,66,990 ./.:7:18:7:18:0,18,270
chr20 69635 . G T,<NON_REF> . . BaseQRankSum=0.937;DP=14;MQ=34.15;MQ0=0;MQRankSum=1.30;ReadPosRankSum=1.75 GT:AD:DP:GQ:MIN_DP:MIN_GQ:PL:SB ./.:4,3,0:7:89:.:.:89,0,119,101,128,229:0,4,0,3 ./.:.:7:18:7:18:0,18,270,18,270,270
chr20 69636 . A <NON_REF> . . END=69675 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./. ./.:7:18:7:18:0,18,270
chr20 69762 . T <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:18:7:18:0,18,270 ./.
chr20 69763 . A <NON_REF> . . END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:21:7:21:0,21,253 ./.
chr20 69767 . C <NON_REF> . . END=69771 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:12:7:12:0,12,180 ./.:7:12:7:15:0,12,180
chr20 69772 . AAAGC A,<NON_REF> . . BaseQRankSum=0.937;DP=14;MQ=34.15;MQ0=0;MQRankSum=1.30;ReadPosRankSum=1.75 GT:AD:DP:GQ:MIN_DP:MIN_GQ:PL:SB ./.:4,3,0:7:89:.:.:89,0,119,101,128,229:0,4,0,3 ./.:.:7:12:7:15:0,12,180,12,180,180
chr20 69773 . A <NON_REF> . . END=69774 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:0:3:0:0,0,0 ./.:7:12:7:15:0,12,180
chr20 69775 . C <NON_REF> . . END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:0:3:0:0,0,0 ./.:7:15:7:15:0,15,160
chr20 69784 . A <NON_REF> . . END=69791 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./. ./.:7:15:7:15:0,15,160
chr21 1 . G <NON_REF> . . END=1999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 2000000 . N <NON_REF> . . END=3999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 4000000 . N <NON_REF> . . END=5999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 6000000 . C <NON_REF> . . END=7999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 8000000 . G <NON_REF> . . END=9999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 10000000 . C <NON_REF> . . END=11999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 12000000 . N <NON_REF> . . END=13999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 14000000 . T <NON_REF> . . END=15999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 16000000 . A <NON_REF> . . END=17999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 18000000 . G <NON_REF> . . END=19999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 20000000 . A <NON_REF> . . END=21999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 22000000 . T <NON_REF> . . END=23999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 24000000 . G <NON_REF> . . END=25999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 26000000 . G <NON_REF> . . END=27999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 28000000 . C <NON_REF> . . END=29999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 30000000 . G <NON_REF> . . END=31999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 32000000 . A <NON_REF> . . END=33999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 34000000 . T <NON_REF> . . END=35999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 36000000 . G <NON_REF> . . END=37999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 38000000 . A <NON_REF> . . END=39999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 40000000 . T <NON_REF> . . END=41999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 42000000 . G <NON_REF> . . END=43999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 44000000 . G <NON_REF> . . END=45999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 46000000 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0

0 comments on commit 07890fe

Please sign in to comment.