Skip to content

Commit

Permalink
FilterIntervals now filters out any singleton intervals (#6559)
Browse files Browse the repository at this point in the history
* FilterIntervals now filters out any singleton intervals, i.e., intervals that have no other intervals on their contigs.
  • Loading branch information
asmirnov239 authored and jonn-smith committed Jul 14, 2020
1 parent 41ad7b3 commit 2fcdf89
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,16 @@
* {@link CollectReadCounts}, outputs a filtered Picard interval list. The set intersection of intervals from the
* specified intervals, the annotated intervals, and the first count file will be taken as the initial set of intervals
* on which to perform filtering. Parameters for filtering based on the annotations and counts can be adjusted.
* Annotation-based filters will be applied first, followed by count-based filters. The result may be passed via -L to
* other tools (e.g., {@link DetermineGermlineContigPloidy} and {@link GermlineCNVCaller}) to mask intervals from
* analysis.
* Annotation-based filters will be applied first, followed by count-based filters. Finally, any singleton intervals
* (i.e., intervals that are alone on their corresponding contigs) remaining after the other filters will be filtered
* out. The result may be passed via -L to other tools (e.g., {@link DetermineGermlineContigPloidy} and
* {@link GermlineCNVCaller}) to mask intervals from analysis.
*
* <h3>Inputs</h3>
*
* <ul>
* <li>
Intervals to be filtered (typically, the bins output by {@link PreprocessIntervals}).
* Intervals to be filtered (typically, the bins output by {@link PreprocessIntervals}).
* The argument {@code interval-merging-rule} must be set to {@link IntervalMergingRule#OVERLAPPING_ONLY}
* and all other common arguments for interval padding or merging must be set to their defaults.
* A blacklist of regions in which intervals should always be filtered (regardless of other annotation-based
Expand Down Expand Up @@ -431,6 +432,23 @@ private SimpleIntervalCollection filterIntervals() {
countNumberPassing(mask), numIntersectedIntervals));
}

//finally, filter intervals that are solitary in their corresponding contigs
final Map<String, Long> contigToIntervalCountMap = IntStream.range(0, numIntersectedIntervals)
.filter(i -> !mask[i])
.mapToObj(i -> intersectedIntervals.getRecords().get(i))
.collect(Collectors.groupingBy(SimpleInterval::getContig, Collectors.counting()));
IntStream.range(0, numIntersectedIntervals)
.filter(i -> !mask[i])
.forEach(i -> {
final String contig = intersectedIntervals.getRecords().get(i).getContig();
final long intervalCount = contigToIntervalCountMap.get(contig);
if (intervalCount == 1) {
logger.warn(String.format("After applying provided filters, contig %s was left with a single" +
" interval that was filtered out.", contig));
mask[i] = true;
}
});

logger.info(String.format("%d / %d intervals passed all filters...", countNumberPassing(mask), numIntersectedIntervals));

//return the filtered intervals as a SimpleIntervalCollection
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -318,6 +315,8 @@ public void onTraversalStart() {
"paths are provided in matching order.");
sortedIntervalCollections = sortedIntervalCollectionsFromCalls;

checkForSingletonInterval(sortedIntervalCollections);

/* assert that allosomal contigs are contained in the SAM sequence dictionary */
final Set<String> allContigs = sequenceDictionary.getSequences().stream()
.map(SAMSequenceRecord::getSequenceName)
Expand Down Expand Up @@ -654,4 +653,22 @@ private List<SimpleIntervalCollection> getUnsortedIntervalCollectionsFromModels(
}
return unsortedIntervalCollectionsFromModels;
}

/**
 * Rejects input whose sharded interval lists, taken together, leave some contig with exactly one interval.
 *
 * <p>All intervals from every collection are pooled and counted per contig; the check fails on the first
 * contig encountered (in the iteration order of the grouped counts) whose count is one.</p>
 *
 * @param intervalCollections the interval collections whose intervals are pooled and counted per contig
 * @throws IllegalArgumentException if any contig carries exactly one interval across all collections
 */
private void checkForSingletonInterval(final List<SimpleIntervalCollection> intervalCollections){
    // Tally how many intervals each contig has across all of the provided collections.
    final var intervalCountsByContig = intervalCollections.stream()
            .flatMap(collection -> collection.getIntervals().stream())
            .collect(Collectors.groupingBy(SimpleInterval::getContig, Collectors.counting()));

    // A count of exactly one marks a singleton interval, which is treated as invalid input.
    intervalCountsByContig.forEach((contig, intervalCount) -> {
        if (intervalCount == 1) {
            throw new IllegalArgumentException(
                    String.format("Records contain a singleton interval on contig (%s)." +
                            " Please run FilterIntervals tool first.", contig));
        }
    });
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.OptionalInt;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,40 @@ public final class FilterIntervalsIntegrationTest extends CommandLineProgramTest
private static final AnnotatedIntervalCollection ANNOTATED_INTERVALS = new AnnotatedIntervalCollection(
LOCATABLE_METADATA,
Arrays.asList(
new AnnotatedInterval(new SimpleInterval("20", 1, 10),
new AnnotatedInterval(new SimpleInterval("20", 1, 10),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.05),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.05)))),
new AnnotatedInterval(new SimpleInterval("20", 11, 20),
new AnnotatedInterval(new SimpleInterval("20", 11, 20),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.95)))),
new AnnotatedInterval(new SimpleInterval("20", 21, 30),
new AnnotatedInterval(new SimpleInterval("20", 21, 30),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("20", 31, 40),
new AnnotatedInterval(new SimpleInterval("20", 31, 40),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.05),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("20", 41, 50),
new AnnotatedInterval(new SimpleInterval("20", 41, 50),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.95),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.95),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("20", 51, 60),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
new AnnotatedInterval(new SimpleInterval("21", 1, 10),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5))))));

@DataProvider(name = "dataAnnotationBasedFilters")
Expand All @@ -92,28 +102,28 @@ public Object[][] dataAnnotationBasedFilters() {
return new Object[][]{
//intervals file, array of strings for excluded intervals, annotated-intervals file,
//min/max GC content, min/max mappability, min/max seg-dupe content, expected array of indices of retained intervals
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)},
{intervalsFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)},
{intervalsFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)},
{intervalsFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)},
{intervalsWithExtraIntervalFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)}};
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3, 5)},
{intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4, 5)},
{intervalsFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(2, 5)},
{intervalsFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3, 5)},
{intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 5)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4, 5)},
{intervalsWithExtraIntervalFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(2, 5)},
{intervalsWithExtraIntervalFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 5)}};
}

@Test(dataProvider = "dataAnnotationBasedFilters")
Expand Down

0 comments on commit 2fcdf89

Please sign in to comment.