Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix and test for calculating CombineGVCFs intermediate band interval start locations. #4681

Merged
merged 1 commit into from
Apr 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,15 @@ public void apply(List<VariantContext> variantContexts, ReferenceContext referen
*/
@VisibleForTesting
void createIntermediateVariants(SimpleInterval intervalToClose) {
Set<Integer> sitesToStop = new HashSet<>();
resizeReferenceIfNeeded(intervalToClose);

// Break up the GVCF according to the provided reference blocking scheme
if ( multipleAtWhichToBreakBands > 0) {
for (int i = ((intervalToClose.getStart())/multipleAtWhichToBreakBands)*multipleAtWhichToBreakBands; i <= intervalToClose.getEnd(); i+=multipleAtWhichToBreakBands) {
sitesToStop.add(i-1); // Subtract 1 here because we want to split before this base
}
}
// The values returned from getIntermediateStopSites represent a proposed set of stop sites that may include
// intervals that are outside the actual interval being closed. These sites are filtered out below.
// Note: Precomputing these is really inefficient when large reference blocks are closed with
// fine band resolution because it results in very large collections of stop sites (tens or hundreds of millions)
// that must subsequently be sorted.
final Set<Integer> sitesToStop = getIntermediateStopSites(intervalToClose, multipleAtWhichToBreakBands);

// If any variant contexts ended (or were spanning deletions) the last context compute where we should stop them
for (VariantContext vc : variantContextsOverlappingCurrentMerge) {
Expand All @@ -181,7 +181,7 @@ void createIntermediateVariants(SimpleInterval intervalToClose) {
List<Integer> stoppedLocs = new ArrayList<>(sitesToStop);
stoppedLocs.sort(Comparator.naturalOrder());

// For each stopped loc, create a fake QueuedContextState and pass it to endPreviousStats
// For each stopped loc that is within the interval being closed, create a fake QueuedContextState and pass it to endPreviousStats
for (int stoppedLoc : stoppedLocs) {
SimpleInterval loc = new SimpleInterval(intervalToClose.getContig(), stoppedLoc, stoppedLoc);
if (( stoppedLoc <= intervalToClose.getEnd() && stoppedLoc>= intervalToClose.getStart()) && isWithinInterval(loc)) {
Expand All @@ -192,6 +192,25 @@ void createIntermediateVariants(SimpleInterval intervalToClose) {

}

// Get any intermediate stop sites based on the break band multiple.
@VisibleForTesting
protected final static Set<Integer> getIntermediateStopSites(final SimpleInterval intervalToClose, final int breakBandMultiple) {
final Set<Integer> sitesToStop = new HashSet<>();

if ( breakBandMultiple > 0) {
// if the intermediate interval to close starts before the end of the first band multiple,
// create the first stop position at the end of the band multiple
for (int blockEndPosition = intervalToClose.getStart() < (breakBandMultiple + 1) ?
Math.max(2, breakBandMultiple) :
(intervalToClose.getStart() / breakBandMultiple) * breakBandMultiple;
blockEndPosition <= intervalToClose.getEnd();
blockEndPosition += breakBandMultiple) {
sitesToStop.add(blockEndPosition - 1); // Subtract 1 here because we want to split before this base
}
}
return sitesToStop;
}

/**
* Resize {@link #storedReferenceContext} to cover at least as much as intervalToClose
* @param intervalToClose
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ public Object[][] gvcfsToCombine() {
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},
//Test that trailing reference blocks are emitted with correct intervals
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")}, getTestFile("gvcfWithTrailingReferenceBlocksExpected.g.vcf"), NO_EXTRA_ARGS, b38_reference_20_21},
// same test as the previous one, except with a band multiple specified
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")},
getTestFile("gvcfWithTrailingReferenceBlocksBandedExpected.g.vcf"),
Arrays.asList("--" + CombineGVCFs.BREAK_BANDS_LONG_NAME, "2000000"),
b38_reference_20_21},
};
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package org.broadinstitute.hellbender.tools.walkers;

import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.*;
import java.util.stream.Collectors;

public class CombineGVCFsUnitTest {

@DataProvider(name="breakIntermediateStopSites")
public Object[][] getIntermediateStopSitesData() {
return new Object[][] {
// Note that the expected results here do not represent a final set of stop sites for the given
// interval. Rather, they are intended to match the output of the CombineGVCFs.getIntermediateStopSites
// method, which returns an initial set of intermediate stop sites that in some cases includes sites
// outside the actual interval being closed, but which are subsequently filtered out by additional
// downstream code in CombineGVCFs.
{ new SimpleInterval("contig", 1, 1), 1, Collections.EMPTY_LIST },
{ new SimpleInterval("contig", 1, 2), 1, Arrays.asList(1) },
{ new SimpleInterval("contig", 1, 2), 2, Arrays.asList(1) },
{ new SimpleInterval("contig", 1, 10), 2, Arrays.asList(1, 3, 5, 7, 9) },
{ new SimpleInterval("contig", 1, 10), 5, Arrays.asList(4, 9) },
{ new SimpleInterval("contig", 1, 100), 25, Arrays.asList(24, 49, 74, 99) },

{ new SimpleInterval("contig", 10, 10), 2, Arrays.asList(9) },
{ new SimpleInterval("contig", 10, 10), 5, Arrays.asList(9) },

{ new SimpleInterval("contig", 10, 10), 10, Arrays.asList(9) },
{ new SimpleInterval("contig", 10, 10), 100, Collections.EMPTY_LIST },
{ new SimpleInterval("contig", 10, 20), 5, Arrays.asList(9, 14, 19) },
{ new SimpleInterval("contig", 10, 20), 10, Arrays.asList(9, 19) },
{ new SimpleInterval("contig", 10, 20), 100, Collections.EMPTY_LIST },

{ new SimpleInterval("contig", 10, 100), 25, Arrays.asList(24, 49, 74, 99) },
{ new SimpleInterval("contig", 10, 100), 50, Arrays.asList(49, 99) },
{ new SimpleInterval("contig", 10, 100), 100, Arrays.asList(99) },
{ new SimpleInterval("contig", 10, 100), 1000, Collections.EMPTY_LIST },
{ new SimpleInterval("contig", 110, 120), 100, Arrays.asList(99) }
};
}

@Test(dataProvider = "breakIntermediateStopSites")
public void testGetIntermediateStopSites(
final SimpleInterval intervalToClose,
final int breakBandMultiple,
final List<Integer> expectedCloseSites)
{
final List<Integer> actualStopSites = new ArrayList<>(CombineGVCFs.getIntermediateStopSites(intervalToClose, breakBandMultiple));
actualStopSites.sort(Comparator.naturalOrder());
// validate that the resulting stop sites all result in valid single-position stop intervals
actualStopSites.stream().forEach(stopSite -> Assert.assertNotNull(new SimpleInterval(intervalToClose.getContig(), stopSite, stopSite)));
Assert.assertEquals(actualStopSites, expectedCloseSites);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineGVCFs,CommandLine="CombineGVCFs --output /var/folders/cr/16ghvyfj5lvfwxx01rt1k4tdl04sy3/T/combinegvcfs3475467159514601676.vcf --break-bands-at-multiples-of 2000000 --variant /Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs/gvcfExample1WithTrailingReferenceBlocks.g.vcf --variant /Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs/gvcfExample2WithTrailingReferenceBlocks.g.vcf --reference /Users/cnorman/projects/gatk/src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta --verbosity ERROR --annotation-group StandardAnnotation --disable-tool-default-annotations false --convert-to-base-pair-resolution false --ignore-variants-starting-outside-interval false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --help false --version false --showHidden false --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --disable-tool-default-read-filters false",Version=Unavailable,Date="April 19, 2018 9:53:59 AM EDT">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr20_GL383577v2_alt,length=128386>
##contig=<ID=chr20_KI270869v1_alt,length=118774>
##contig=<ID=chr20_KI270871v1_alt,length=58661>
##contig=<ID=chr20_KI270870v1_alt,length=183433>
##contig=<ID=chr21_GL383578v2_alt,length=63917>
##contig=<ID=chr21_KI270874v1_alt,length=166743>
##contig=<ID=chr21_KI270873v1_alt,length=143900>
##contig=<ID=chr21_GL383579v2_alt,length=201197>
##contig=<ID=chr21_GL383580v2_alt,length=74653>
##contig=<ID=chr21_GL383581v2_alt,length=116689>
##contig=<ID=chr21_KI270872v1_alt,length=82692>
##source=CombineGVCFs
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 NA2
chr20 69491 . G <NON_REF> . . END=69497 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.
chr20 69498 . C <NON_REF> . . END=69506 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.:94:99:82:99:0,120,1800
chr20 69507 . T <NON_REF> . . END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:94:99:82:99:0,120,1800 ./.
chr20 69511 . A G,<NON_REF> . . BaseQRankSum=1.17;DP=82;MQ=31.05;MQ0=0;MQRankSum=-8.660e-01;ReadPosRankSum=1.69 GT:AD:DP:GQ:PL:SB ./.:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 ./.
chr20 69512 . C <NON_REF> . . END=69513 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:96:99:82:99:0,120,1800 ./.
chr20 69514 . T <NON_REF> . . END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:96:99:82:99:0,120,1800 ./.:96:99:82:99:0,120,1800
chr20 69522 . T <NON_REF> . . END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:95:0:95:0:0,0,0 ./.:96:99:82:99:0,120,1800
chr20 69549 . G <NON_REF> . . END=69624 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:156:99:56:66:0,66,990 ./.:96:99:82:99:0,120,1800
chr20 69625 . A <NON_REF> . . END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:156:99:56:66:0,66,990 ./.:7:18:7:18:0,18,270
chr20 69635 . G T,<NON_REF> . . BaseQRankSum=0.937;DP=14;MQ=34.15;MQ0=0;MQRankSum=1.30;ReadPosRankSum=1.75 GT:AD:DP:GQ:MIN_DP:MIN_GQ:PL:SB ./.:4,3,0:7:89:.:.:89,0,119,101,128,229:0,4,0,3 ./.:.:7:18:7:18:0,18,270,18,270,270
chr20 69636 . A <NON_REF> . . END=69675 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./. ./.:7:18:7:18:0,18,270
chr20 69762 . T <NON_REF> . . . GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:18:7:18:0,18,270 ./.
chr20 69763 . A <NON_REF> . . END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:21:7:21:0,21,253 ./.
chr20 69767 . C <NON_REF> . . END=69771 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:12:7:12:0,12,180 ./.:7:12:7:15:0,12,180
chr20 69772 . AAAGC A,<NON_REF> . . BaseQRankSum=0.937;DP=14;MQ=34.15;MQ0=0;MQRankSum=1.30;ReadPosRankSum=1.75 GT:AD:DP:GQ:MIN_DP:MIN_GQ:PL:SB ./.:4,3,0:7:89:.:.:89,0,119,101,128,229:0,4,0,3 ./.:.:7:12:7:15:0,12,180,12,180,180
chr20 69773 . A <NON_REF> . . END=69774 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:0:3:0:0,0,0 ./.:7:12:7:15:0,12,180
chr20 69775 . C <NON_REF> . . END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./.:7:0:3:0:0,0,0 ./.:7:15:7:15:0,15,160
chr20 69784 . A <NON_REF> . . END=69791 GT:DP:GQ:MIN_DP:MIN_GQ:PL ./. ./.:7:15:7:15:0,15,160
chr21 1 . G <NON_REF> . . END=1999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 2000000 . N <NON_REF> . . END=3999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 4000000 . N <NON_REF> . . END=5999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 6000000 . C <NON_REF> . . END=7999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 8000000 . G <NON_REF> . . END=9999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 10000000 . C <NON_REF> . . END=11999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 12000000 . N <NON_REF> . . END=13999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 14000000 . T <NON_REF> . . END=15999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 16000000 . A <NON_REF> . . END=17999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 18000000 . G <NON_REF> . . END=19999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 20000000 . A <NON_REF> . . END=21999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 22000000 . T <NON_REF> . . END=23999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 24000000 . G <NON_REF> . . END=25999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 26000000 . G <NON_REF> . . END=27999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 28000000 . C <NON_REF> . . END=29999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 30000000 . G <NON_REF> . . END=31999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 32000000 . A <NON_REF> . . END=33999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 34000000 . T <NON_REF> . . END=35999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 36000000 . G <NON_REF> . . END=37999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 38000000 . A <NON_REF> . . END=39999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 40000000 . T <NON_REF> . . END=41999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 42000000 . G <NON_REF> . . END=43999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 44000000 . G <NON_REF> . . END=45999999 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr21 46000000 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL ./.:0:0:0:0,0,0 ./.:0:0:0:0,0,0