-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WeightedSplitInterval fixes [VS-384] [VS-332] #7795
Changes from all commits
e84990c
592e687
2f461a5
4a80bc1
1eee19f
699ecf2
ef51fc9
553417e
ab8e4a2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -111,7 +111,7 @@ public void onTraversalStart() { | |
WeightedInterval wi = (WeightedInterval) iter.next(); | ||
|
||
// if we're not mixing contigs, but we switched contigs, emit the list | ||
if (dontMixContigs && lastContig != null && !lastContig.equals(wi.getContig()) ) { | ||
if (dontMixContigs && lastContig != null && !lastContig.equals(wi.getContig())) { | ||
// write out the current list (uniqued and sorted) and start a new one | ||
writeIntervalList(formatString, scatterPiece++, currentList); | ||
currentList = new IntervalList(sequenceDictionary); | ||
|
@@ -122,24 +122,29 @@ public void onTraversalStart() { | |
lastContig = wi.getContig(); | ||
|
||
// if the interval fits completely, just add it | ||
if (cumulativeWeight + wi.getWeight() <= targetWeightPerScatter ) { | ||
if (cumulativeWeight + wi.getWeight() <= targetWeightPerScatter) { | ||
cumulativeWeight += wi.getWeight(); | ||
currentList.add(wi); | ||
|
||
// if it would push us over the edge | ||
} else { | ||
// add a piece of it | ||
// try to add a piece of it | ||
float remainingSpace = targetWeightPerScatter - cumulativeWeight; | ||
|
||
// how many bases can we take? | ||
int basesToTake = (int) Math.floor(remainingSpace / wi.getWeightPerBase()); | ||
|
||
// split and add the first part into this list | ||
WeightedInterval[] pair = wi.split(basesToTake); | ||
currentList.add(pair[0]); | ||
|
||
// push the remainder back onto the iterator | ||
iter.pushback(pair[1]); | ||
int basesToTake = (int) Math.floor(remainingSpace / wi.getWeightPerBase()); | ||
|
||
if (basesToTake == 0) { | ||
// We can't add any more bases to the current interval list so put this interval back. | ||
iter.pushback(wi); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixes the Charlie crashing issue and the resulting interval list files are identical to those produced by the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looks great |
||
} else { | ||
// split and add the first part into this list | ||
WeightedInterval[] pair = wi.split(basesToTake); | ||
currentList.add(pair[0]); | ||
|
||
// push the remainder back onto the iterator | ||
iter.pushback(pair[1]); | ||
} | ||
|
||
// add uniqued, sorted output list and reset | ||
writeIntervalList(formatString, scatterPiece++, currentList); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,21 @@ | ||
package org.broadinstitute.hellbender.tools.gvs.common; | ||
|
||
import htsjdk.samtools.SAMSequenceDictionary; | ||
import htsjdk.samtools.SAMSequenceRecord; | ||
import htsjdk.samtools.util.Interval; | ||
import htsjdk.samtools.util.IntervalList; | ||
import htsjdk.samtools.util.OverlapDetector; | ||
import org.broadinstitute.hellbender.CommandLineProgramTest; | ||
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; | ||
import org.broadinstitute.hellbender.tools.walkers.SplitIntervals; | ||
import org.broadinstitute.hellbender.tools.walkers.SplitIntervalsIntegrationTest; | ||
import org.broadinstitute.hellbender.utils.SimpleInterval; | ||
import org.testng.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.TreeSet; | ||
import java.util.stream.Collectors; | ||
|
||
import static java.util.Arrays.asList; | ||
|
||
|
||
// Suppresses the "expression might be null" warnings which would be genuine concerns in production code but which | ||
// are fine in tests (unexpected and dereferenced nulls should crash the test which will appropriately signal failure). | ||
@SuppressWarnings("ConstantConditions") | ||
public class WeightedSplitIntervalsIntegrationTest extends CommandLineProgramTest { | ||
|
||
@Test | ||
|
@@ -62,7 +54,6 @@ public void testNoLossRealisticWgs() { | |
|
||
final int scatterCount = 100; | ||
final File outputDir = createTempDir("output"); | ||
final Interval interval = new Interval("chr20", 1000000, 2000000); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this variable in an existing test wasn't being used which may or may not be concerning? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably just a cut and paste from testNoLossSimple There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good spot, I think you're probably right. |
||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder() | ||
.addInterval(wgsIntervalList.getAbsolutePath()) | ||
|
@@ -88,6 +79,44 @@ public void testNoLossRealisticWgs() { | |
|
||
} | ||
|
||
@Test | ||
public void testHandleNotOneMoreBase() { | ||
// Sometimes, the remaining space in a scatter interval is non-zero, yet can't take | ||
// even a single base of the next interval. | ||
// e.g. | ||
// 1 bases, 200 weight each | ||
// 10 bases 1000 weight each | ||
// target: sum 10200 / 10 -> 1020 per shard | ||
// chr20 60000 60001 . 200 | ||
// chr20 60001 60011 . 10000 | ||
final File weights = new File(publicTestDir + "example_weights_chr20_test_zero.bed"); | ||
final int scatterCount = 10; | ||
final File outputDir = createTempDir("output"); | ||
final Interval inputInterval = new Interval("chr20",60000,60109); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder() | ||
.addInterval(inputInterval.getContig() + ":" + inputInterval.getStart() + "-" + inputInterval.getEnd()) | ||
.addReference(hg38Reference) | ||
.add(SplitIntervals.SCATTER_COUNT_SHORT_NAME, scatterCount) | ||
.add(WeightedSplitIntervals.WEIGHTS_BED_FILE_FULL_NAME, weights) | ||
.addOutput(outputDir); | ||
runCommandLine(args); | ||
|
||
// even though we asked for 10 scatter, we should get 11 because the target per shard is 1020 weight | ||
// and due to the size of our weights each of the 11 bases must be in their own shard | ||
Assert.assertEquals(outputDir.listFiles().length, 11); | ||
|
||
// verify we have exactly the input intervals | ||
IntervalList outList = IntervalList.fromFiles(Arrays.stream(outputDir.listFiles()).collect(Collectors.toList())).uniqued().sorted(); | ||
|
||
// assert it's a single interval | ||
Assert.assertEquals(outList.getIntervals().size(), 1); | ||
|
||
// and the interval itself is the same | ||
Assert.assertEquals(outList.getIntervals().get(0).compareTo(inputInterval), 0); | ||
|
||
} | ||
|
||
@Test | ||
public void testDontMixContigs() { | ||
final File weights = new File(publicTestDir + "example_weights_chr20_chr21.bed.gz"); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr20 60000 60001 . 200 | ||
chr20 60001 60011 . 10000 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
interval list files were getting lines like
which should have been
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice catch---but are nulls okay here? Seems odd we were getting nulls to begin with?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
WeightedSplitIntervals
only uses the two-arg version of theWeightedInterval
constructor which means theWeightedInterval
s it creates always have anull
name.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to give them names? Not necessary, but looks kind of odd without them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would defer to others on that, I am new enough to this domain that this didn't jump out as odd to me. 🙂
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah I've always found the interval names in these files less than useful since unless your targeting biological constructs (ie gene names, etc) you typically end up doing something like "foo-XXX" which sort of is meaningless