Skip to content

Commit

Permalink
block compressed interval streams (#7142)
Browse files Browse the repository at this point in the history
Added support for binary, indexed, block-compressed streams of intervals integrated with the GATK engine
  • Loading branch information
tedsharpe authored Jul 20, 2021
1 parent 3a46fd8 commit c042e8f
Show file tree
Hide file tree
Showing 85 changed files with 2,876 additions and 668 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,26 @@
import org.broadinstitute.hellbender.utils.IndexUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.FeaturesHeader;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.Reader;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.genomicsdb.model.GenomicsDBExportConfiguration;
import org.genomicsdb.reader.GenomicsDBFeatureReader;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;

import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration;
import static org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.BCI_FILE_EXTENSION;

/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
Expand Down Expand Up @@ -305,7 +306,8 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
* @param setNameOnCodec If true, and if this FeatureDataSource uses a NameAwareCodec, the name of the FeatureInput will be used to set the codec's name. This exists as a mechanism to store the FeatureInput name in the source field of VariantContexts
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions, final boolean setNameOnCodec) {
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions,
final boolean setNameOnCodec) {
Utils.validateArg(queryLookaheadBases >= 0, "Query lookahead bases must be >= 0");
this.featureInput = Utils.nonNull(featureInput, "featureInput must not be null");
if (IOUtils.isGenomicsDBPath(featureInput)) {
Expand All @@ -319,12 +321,14 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer),
genomicsDBOptions, setNameOnCodec);

if (IOUtils.isGenomicsDBPath(featureInput)) {
if (IOUtils.isGenomicsDBPath(featureInput) ||
featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION)) {
//genomics db uri's have no associated index file to read from, but they do support random access
// likewise with block-compressed interval files
this.hasIndex = false;
this.supportsRandomAccess = true;
} else if (featureReader instanceof AbstractFeatureReader) {
this.hasIndex = ((AbstractFeatureReader<T, ?>) featureReader).hasIndex();
this.hasIndex = ((AbstractFeatureReader<T, ?>)featureReader).hasIndex();
this.supportsRandomAccess = hasIndex;
} else {
throw new GATKException("Found a feature input that was neither GenomicsDB or a Tribble AbstractFeatureReader. Input was " + featureInput.toString() + ".");
Expand All @@ -345,7 +349,7 @@ final void printCacheStats() {
queryCache.printCacheStatistics( getName() );
}

@SuppressWarnings("unchecked")
@SuppressWarnings({"unchecked", "rawtypes"})
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper,
Expand All @@ -367,6 +371,9 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
} else {
final FeatureCodec<T, ?> codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec);
if ( featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION) ) {
return new Reader(featureInput, codec);
}
return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper);
}
}
Expand All @@ -380,7 +387,8 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
*/
@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureCodec<T, ?> getCodecForFeatureInput(final FeatureInput<T> featureInput,
final Class<? extends Feature> targetFeatureType, final boolean setNameOnCodec) {
final Class<? extends Feature> targetFeatureType,
final boolean setNameOnCodec) {
final FeatureCodec<T, ?> codec;
final Class<FeatureCodec<T, ?>> codecClass = featureInput.getFeatureCodecClass();
if (codecClass == null) {
Expand All @@ -391,6 +399,10 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
} else {
try {
codec = codecClass.getDeclaredConstructor().newInstance();
if ( !codec.canDecode(featureInput.toPath().toAbsolutePath().toUri().toString()) ) {
throw new GATKException(codec.getClass().getSimpleName() + " cannot decode " +
featureInput.toPath().toString());
}
} catch (final InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
throw new GATKException("Unable to automatically instantiate codec " + codecClass.getName());
}
Expand Down Expand Up @@ -461,7 +473,9 @@ protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final
public SAMSequenceDictionary getSequenceDictionary() {
SAMSequenceDictionary dict = null;
final Object header = getHeader();
if (header instanceof VCFHeader) {
if ( header instanceof FeaturesHeader ) {
dict = ((FeaturesHeader)header).getDictionary();
} else if (header instanceof VCFHeader) {
dict = ((VCFHeader) header).getSequenceDictionary();
}
if (dict != null && !dict.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? e
featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, genomicsDBOptions));
}

/**
 * Registers an already-constructed FeatureDataSource for the given FeatureInput, bypassing the
 * overload that builds the data source itself. Package-private: intended for engine-internal
 * callers (e.g. FeatureWalker registering its driving-features source) that need full control
 * over how the FeatureDataSource was configured.
 *
 * NOTE(review): unlike the int-lookahead overload above, this performs no construction or
 * validation — it simply stores the mapping; caller is responsible for closing the source
 * via the usual feature-manager lifecycle. Also note the nonstandard space before the
 * parameter list ("addToFeatureSources (").
 *
 * @param featureInput key identifying the feature source (must not already be registered,
 *                     or the previous data source is silently replaced — TODO confirm intended)
 * @param featureDataSource the fully-initialized data source to associate with featureInput
 * @param <F> the Feature type of the input and its data source
 */
<F extends Feature> void addToFeatureSources (final FeatureInput<F> featureInput,
final FeatureDataSource<F> featureDataSource) {
featureSources.put(featureInput, featureDataSource);
}

/**
* Given a ArgumentDefinition for an argument known to be of type FeatureInput (or a Collection thereof), retrieves the type
* parameter for the FeatureInput (eg., for FeatureInput<VariantContext> or List<FeatureInput<VariantContext>>
Expand Down Expand Up @@ -375,6 +380,22 @@ public <T extends Feature> Iterator<T> getFeatureIterator(final FeatureInput<T>
return dataSource.iterator();
}

/**
 * Returns an iterator over all Features in the backing data source for the given FeatureInput,
 * restricted to an optional list of intervals. Unlike the single-argument overload above,
 * the traversal is temporarily scoped to {@code intervals} while the iterator is created.
 *
 * NOTE(review): the traversal intervals are reset to null immediately after {@code iterator()}
 * returns, BEFORE the iterator is consumed. This is only correct if
 * FeatureDataSource.iterator() captures the interval restriction eagerly at creation time
 * rather than reading it lazily during iteration — confirm against FeatureDataSource; if
 * lazy, the returned iterator would silently ignore {@code intervals}.
 *
 * @param featureDescriptor FeatureInput to scan; must have been registered with this manager,
 *                          otherwise lookupDataSource is expected to fail
 * @param intervals The userIntervals to examine (may be null, meaning no restriction)
 * @param <T> Feature type
 * @return An iterator over the Features overlapping the given intervals (or all Features if
 *         intervals is null)
 */
public <T extends Feature> Iterator<T> getFeatureIterator( final FeatureInput<T> featureDescriptor,
final List<SimpleInterval> intervals ) {
final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
dataSource.setIntervalsForTraversal(intervals);
final Iterator<T> itr = dataSource.iterator();
dataSource.setIntervalsForTraversal(null);
return itr;
}

/**
* Get the header associated with a particular FeatureInput
*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

Expand Down Expand Up @@ -57,12 +59,18 @@ private void initializeDrivingFeatures() {
final GATKPath drivingPath = getDrivingFeaturePath();
final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(drivingPath.toPath());
if (isAcceptableFeatureType(codec.getFeatureType())) {
drivingFeatures = new FeatureDataSource<>(new FeatureInput<>(drivingPath), FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, null, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, referenceArguments.getReferencePath());

final FeatureInput<F> drivingFeaturesInput = new FeatureInput<>(drivingPath, "drivingFeatureFile");
features.addToFeatureSources(0, drivingFeaturesInput, codec.getFeatureType(), cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
referenceArguments.getReferencePath());
header = getHeaderForFeatures(drivingFeaturesInput);
final GenomicsDBOptions options = new GenomicsDBOptions(referenceArguments.getReferencePath());
final FeatureInput<F> drivingFeatureInput = new FeatureInput<>(drivingPath);
drivingFeatureInput.setFeatureCodecClass((Class<FeatureCodec<F, ?>>)codec.getClass());
drivingFeatures = new FeatureDataSource<>(drivingFeatureInput, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, null,
cloudPrefetchBuffer, cloudIndexPrefetchBuffer, options, false);
header = drivingFeatures.getHeader();

final FeatureInput<F> featureInput = new FeatureInput<>(drivingPath, "drivingFeatureFile");
featureInput.setFeatureCodecClass((Class<FeatureCodec<F, ?>>)codec.getClass());
features.addToFeatureSources(featureInput,
new FeatureDataSource<>(featureInput, 0, codec.getFeatureType(),
cloudPrefetchBuffer, cloudIndexPrefetchBuffer, options, false));
} else {
throw new UserException("File " + drivingPath.getRawInputString() + " contains features of the wrong type.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
} else if (hasReads()){
return reads.getSequenceDictionary();
} else if (hasFeatures()){
final List<SAMSequenceDictionary> dictionaries = features.getVariantSequenceDictionaries();
final List<SAMSequenceDictionary> dictionaries = features.getAllSequenceDictionaries();
//If there is just one, it clearly is the best. Otherwise, none is best.
if (dictionaries.size() == 1){
return dictionaries.get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleNovelAdjacencyAndChimericAlignmentEvidence;
import org.broadinstitute.hellbender.tools.spark.sv.utils.CNVInputReader;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.BaseUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.FindBreakpointEvidenceSpark;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigAlignmentsRDDProcessor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Collections;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleNovelAdjacencyInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.PairedStrandedIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;

import java.util.List;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFReader;
import org.broadinstitute.hellbender.utils.SimpleInterval;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.*;
import htsjdk.samtools.util.SequenceUtil;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.Strand;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Tail;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.StrandSwitch;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import scala.Tuple2;

import java.util.ArrayList;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputMetaData;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.read.GATKRead;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.Utils;

import java.io.Serializable;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvType;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import scala.Tuple2;

import java.util.List;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import htsjdk.samtools.*;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;

Expand Down
Loading

0 comments on commit c042e8f

Please sign in to comment.