Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FilterIntervals now filters out any singleton intervals #6559

Merged
merged 2 commits into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,16 @@
* {@link CollectReadCounts}, outputs a filtered Picard interval list. The set intersection of intervals from the
* specified intervals, the annotated intervals, and the first count file will be taken as the initial set of intervals
* on which to perform filtering. Parameters for filtering based on the annotations and counts can be adjusted.
* Annotation-based filters will be applied first, followed by count-based filters. The result may be passed via -L to
* other tools (e.g., {@link DetermineGermlineContigPloidy} and {@link GermlineCNVCaller}) to mask intervals from
* analysis.
* Annotation-based filters will be applied first, followed by count-based filters. Finally, any singleton intervals
* (i.e., intervals that are the only ones remaining on their corresponding contigs) left after applying the other
* filters will be filtered out. The result may be passed via -L to other tools (e.g.,
* {@link DetermineGermlineContigPloidy} and {@link GermlineCNVCaller}) to mask intervals from analysis.
*
* <h3>Inputs</h3>
*
* <ul>
* <li>
Intervals to be filtered (typically, the bins output by {@link PreprocessIntervals}).
* Intervals to be filtered (typically, the bins output by {@link PreprocessIntervals}).
* The argument {@code interval-merging-rule} must be set to {@link IntervalMergingRule#OVERLAPPING_ONLY}
* and all other common arguments for interval padding or merging must be set to their defaults.
* A blacklist of regions in which intervals should always be filtered (regardless of other annotation-based
Expand Down Expand Up @@ -431,6 +432,23 @@ private SimpleIntervalCollection filterIntervals() {
countNumberPassing(mask), numIntersectedIntervals));
}

//finally, filter intervals that are solitary in their corresponding contigs
final Map<String, Long> contigToIntervalCountMap = IntStream.range(0, numIntersectedIntervals)
.filter(i -> !mask[i])
.mapToObj(i -> intersectedIntervals.getRecords().get(i))
.collect(Collectors.groupingBy(SimpleInterval::getContig, Collectors.counting()));
IntStream.range(0, numIntersectedIntervals)
.filter(i -> !mask[i])
.forEach(i -> {
final String contig = intersectedIntervals.getRecords().get(i).getContig();
final long intervalCount = contigToIntervalCountMap.get(contig);
if (intervalCount == 1) {
logger.warn(String.format("After applying provided filters, contig %s was left with a single" +
" interval that was filtered out.", contig));
mask[i] = true;
}
});

logger.info(String.format("%d / %d intervals passed all filters...", countNumberPassing(mask), numIntersectedIntervals));

//return the filtered intervals as a SimpleIntervalCollection
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -318,6 +315,8 @@ public void onTraversalStart() {
"paths are provided in matching order.");
sortedIntervalCollections = sortedIntervalCollectionsFromCalls;

checkForSingletonInterval(sortedIntervalCollections);

/* assert that allosomal contigs are contained in the SAM sequence dictionary */
final Set<String> allContigs = sequenceDictionary.getSequences().stream()
.map(SAMSequenceRecord::getSequenceName)
Expand Down Expand Up @@ -654,4 +653,22 @@ private List<SimpleIntervalCollection> getUnsortedIntervalCollectionsFromModels(
}
return unsortedIntervalCollectionsFromModels;
}

/**
 * Ensures that, once the sharded interval lists are pooled together, no contig is represented by exactly one
 * interval (a "singleton" interval).
 *
 * @param intervalCollections interval collections whose intervals are pooled and tallied per contig
 * @throws IllegalArgumentException if some contig carries exactly one interval in the pooled collections
 */
private void checkForSingletonInterval(final List<SimpleIntervalCollection> intervalCollections) {
    //pool the intervals from every collection and tally them by contig
    final Map<String, Long> intervalCountPerContig = intervalCollections.stream()
            .flatMap(collection -> collection.getIntervals().stream())
            .collect(Collectors.groupingBy(SimpleInterval::getContig, Collectors.counting()));

    //a tally of exactly one marks a singleton; fail on the first one encountered
    for (final Map.Entry<String, Long> contigAndCount : intervalCountPerContig.entrySet()) {
        if (contigAndCount.getValue() == 1) {
            throw new IllegalArgumentException(
                    String.format("Records contain a singleton interval on contig (%s)." +
                            " Please run FilterIntervals tool first.", contigAndCount.getKey()));
        }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.OptionalInt;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,40 @@ public final class FilterIntervalsIntegrationTest extends CommandLineProgramTest
private static final AnnotatedIntervalCollection ANNOTATED_INTERVALS = new AnnotatedIntervalCollection(
LOCATABLE_METADATA,
Arrays.asList(
new AnnotatedInterval(new SimpleInterval("20", 1, 10),
new AnnotatedInterval(new SimpleInterval("20", 1, 10),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.05),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.05)))),
new AnnotatedInterval(new SimpleInterval("20", 11, 20),
new AnnotatedInterval(new SimpleInterval("20", 11, 20),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.95)))),
new AnnotatedInterval(new SimpleInterval("20", 21, 30),
new AnnotatedInterval(new SimpleInterval("20", 21, 30),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("20", 31, 40),
new AnnotatedInterval(new SimpleInterval("20", 31, 40),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.05),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("20", 41, 50),
new AnnotatedInterval(new SimpleInterval("20", 41, 50),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.95),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.95),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("20", 51, 60),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("21", 1, 10),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5))))));

@DataProvider(name = "dataAnnotationBasedFilters")
Expand All @@ -92,28 +102,28 @@ public Object[][] dataAnnotationBasedFilters() {
return new Object[][]{
//intervals file, array of strings for excluded intervals, annotated-intervals file,
//min/max GC content, min/max mappability, min/max seg-dupe content, expected array of indices of retained intervals
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)},
{intervalsFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)},
{intervalsFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)},
{intervalsFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)},
{intervalsWithExtraIntervalFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)}};
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4, 5)},
{intervalsFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(2, 5)},
{intervalsFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4, 5)},
{intervalsWithExtraIntervalFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(2, 5)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 5)}};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does one of these test for the singleton case?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of them actually do because I added an interval from contig "21" which should be filtered out in all of these.

}

@Test(dataProvider = "dataAnnotationBasedFilters")
Expand Down