Skip to content

Commit

Permalink
Add support for creating ReferenceSequenceFile from a reference Bundle.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Aug 6, 2024
1 parent 2c9878f commit 11db598
Show file tree
Hide file tree
Showing 15 changed files with 685 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ public FASTADecoderV1_0(final Bundle inputBundle) {
this.displayName = inputBundle.getPrimaryResource().getDisplayName();
final BundleResource referenceResource = inputBundle.getOrThrow(BundleResourceType.CT_HAPLOID_REFERENCE);
if (referenceResource.getIOPath().isPresent()) {
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(
referenceResource.getIOPath().get().toPath(), true);
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFileFromBundle(inputBundle, true, true);
} else {
final SeekableStream seekableStream = referenceResource.getSeekableStream().orElseThrow(
() -> new IllegalArgumentException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class BundleResourceType {
/** Secondary content types for {@link BundleResourceType#CT_HAPLOID_REFERENCE} resources*/
public static final String CT_REFERENCE_DICTIONARY = "REFERENCE_DICTIONARY";
public static final String CT_REFERENCE_INDEX = "REFERENCE_INDEX";
public static final String CT_REFERENCE_INDEX_GZI = "REFERENCE_INDEX_GZI";


/****************************************** Resource types for FEATURES ********************************/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,18 @@
import htsjdk.beta.plugin.hapref.HaploidReferenceCodec;
import htsjdk.beta.plugin.hapref.HaploidReferenceDecoder;
import htsjdk.beta.plugin.hapref.HaploidReferenceDecoderOptions;
import htsjdk.io.HtsPath;
import htsjdk.io.IOPath;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.GZIIndex;
import htsjdk.samtools.util.IOUtil;
import htsjdk.utils.ValidationUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Function;

/**
* Class with methods for resolving inputs and outputs to haploid reference encoders and decoders.
* <p>
Expand Down Expand Up @@ -66,9 +75,7 @@ public HaploidReferenceDecoder getHaploidReferenceDecoder(
ValidationUtils.nonNull(inputPath, "Input path");
ValidationUtils.nonNull(HaploidReferenceDecoderOptions, "Decoder options");

final Bundle referenceBundle = new BundleBuilder().addPrimary(
new IOPathResource(inputPath, BundleResourceType.CT_HAPLOID_REFERENCE)).build();

final Bundle referenceBundle = referenceBundleFromFastaPath(inputPath, HtsPath::new);
return getHaploidReferenceDecoder(referenceBundle, HaploidReferenceDecoderOptions);
}

Expand Down Expand Up @@ -110,4 +117,47 @@ public HaploidReferenceDecoder getHaploidReferenceDecoder(
return (HaploidReferenceDecoder) resolveForDecoding(inputBundle).getDecoder(inputBundle, HaploidReferenceDecoderOptions);
}

/**
 * Build a reference {@link Bundle} given only the location of a fasta file.
 * <p>
 * The fasta is always added as the primary resource. Secondary resources are added as follows:
 * <ul>
 *   <li>a sequence dictionary, if a file exists at the default dictionary location for the fasta</li>
 *   <li>a fasta index, if a file exists at the default index location for the fasta</li>
 *   <li>a GZI index, whenever the fasta is block compressed (the resolved GZI location is added
 *       without first checking that a file exists there)</li>
 * </ul>
 *
 * @param fastaPath location of the fasta
 * @param ioPathConstructor function that turns a URI string into the IOPath-derived object used for
 *                          each secondary resource in the bundle
 * @param <T> the IOPath-derived type produced by {@code ioPathConstructor}
 * @return a reference Bundle
 */
public static <T extends IOPath> Bundle referenceBundleFromFastaPath(final IOPath fastaPath, final Function<String, T> ioPathConstructor) {
    final BundleBuilder builder = new BundleBuilder();
    builder.addPrimary(new IOPathResource(fastaPath, BundleResourceType.CT_HAPLOID_REFERENCE));

    // every companion file location is derived from the fasta's nio Path
    final Path fastaNioPath = fastaPath.toPath();

    final Path dictionaryLocation = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(fastaNioPath);
    if (Files.exists(dictionaryLocation)) {
        final T dictionaryIOPath = ioPathConstructor.apply(dictionaryLocation.toUri().toString());
        builder.addSecondary(new IOPathResource(dictionaryIOPath, BundleResourceType.CT_REFERENCE_DICTIONARY));
    }

    final Path indexLocation = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaNioPath);
    if (Files.exists(indexLocation)) {
        final T indexIOPath = ioPathConstructor.apply(indexLocation.toUri().toString());
        builder.addSecondary(new IOPathResource(indexIOPath, BundleResourceType.CT_REFERENCE_INDEX));
    }

    final boolean blockCompressed;
    try {
        blockCompressed = IOUtil.isBlockCompressed(fastaNioPath, true);
    } catch (final IOException e) {
        throw new HtsjdkException("Error while checking for block compression", e);
    }
    if (blockCompressed) {
        // unlike the dictionary and index, the gzi location is added without an existence check
        final Path gziLocation = GZIIndex.resolveIndexNameForBgzipFile(fastaNioPath);
        final T gziIOPath = ioPathConstructor.apply(gziLocation.toUri().toString());
        builder.addSecondary(new IOPathResource(gziIOPath, BundleResourceType.CT_REFERENCE_INDEX_GZI));
    }

    return builder.build();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
/**
* A {@link Bundle} for variants and variants-related resources that are backed by on disk files. A {@link
* htsjdk.beta.plugin.variants.VariantsBundle} has a primary resource with content type {@link
* BundleResourceType#PRIMARY_CT_VARIANT_CONTEXTS}; and an optional index resource. A VariantsBundle can also
* BundleResourceType#CT_VARIANT_CONTEXTS}; and an optional index resource. A VariantsBundle can also
* contain additional resources.
*
* Note that this class is merely a convenience class for the case where the variants are backed by files on disk.
Expand All @@ -31,6 +31,7 @@ public class VariantsBundle extends Bundle implements Serializable {
@Serial
private static final long serialVersionUID = 1L;
private static final Log LOG = Log.getInstance(VariantsBundle.class);

/**
* Create a {@link htsjdk.beta.plugin.variants.VariantsBundle} containing only a variants resource.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@

package htsjdk.samtools.reference;

import htsjdk.io.HtsPath;
import htsjdk.io.IOPath;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.util.BufferedLineReader;
import htsjdk.samtools.util.FileExtensions;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Lazy;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.file.Files;
Expand Down Expand Up @@ -84,13 +84,25 @@ abstract class AbstractFastaSequenceFile implements ReferenceSequenceFile {
/**
 * Attempts to find and load the sequence dictionary if present.
 *
 * @param fasta the fasta file whose dictionary is wanted. NOTE(review): this argument is not consulted
 *              by the body; the lookup is performed on this instance's {@code path} field -- confirm
 *              that this is intended.
 * @return the loaded sequence dictionary, or null if {@code findSequenceDictionary} returns null
 */
protected SAMSequenceDictionary findAndLoadSequenceDictionary(final Path fasta) {
    final Path dictPath = findSequenceDictionary(path);
    if (dictPath == null) {
        return null;
    }
    // hand the dictionary location to the IOPath-based loader via its URI string
    return loadSequenceDictionary(new HtsPath(dictPath.toUri().toString()));
}

IOUtil.assertFileIsReadable(dictPath);
try (InputStream dictionaryIn = IOUtil.openFileForReading(dictPath)) {
return ReferenceSequenceFileFactory.loadDictionary(dictionaryIn);
/**
* Attempt to load a sequence dictionary given a file path. Path may be null.
* @param dictPath the dictionary file to open
* @return the SAMSequenceDictionary, or null
*/
protected static SAMSequenceDictionary loadSequenceDictionary(final IOPath dictPath) {
if (dictPath == null) {
return null;
}
catch (Exception e) {
IOUtil.assertFileIsReadable(dictPath.toPath());
try (final InputStream dictionaryStream = IOUtil.openFileForReading(dictPath.toPath())) {
return ReferenceSequenceFileFactory.loadDictionary(dictionaryStream);
} catch (final IOException e) {
throw new SAMException("Could not open sequence dictionary file: " + dictPath, e);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package htsjdk.samtools.reference;

import htsjdk.io.IOPath;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
Expand Down Expand Up @@ -69,6 +70,27 @@ protected AbstractIndexedFastaSequenceFile(final Path path, final FastaSequenceI
}
}

/**
 * Create a AbstractIndexedFastaSequenceFile from explicitly provided files. No assumptions are made
 * about the relative location of the files (i.e., that they are siblings).
 *
 * @param fastaPath the path to the fasta file. may not be null.
 * @param dictPath the path to the sequence dictionary. may be null.
 * @param index the associated index object; may not be null.
 * @throws IllegalArgumentException if {@code index} is null
 */
protected AbstractIndexedFastaSequenceFile(final IOPath fastaPath, final IOPath dictPath, final FastaSequenceIndex index) {
    super(fastaPath.toPath(), fastaPath.getURIString(), loadSequenceDictionary(dictPath));
    if (index == null) {
        // name the fasta in the message; the index itself is null here and carries no information
        throw new IllegalArgumentException("Null index for fasta " + fastaPath);
    }
    this.index = index;
    IOUtil.assertFileIsReadable(fastaPath.toPath());
    reset();
    // the dictionary is optional; only cross-check it against the index when one was loaded
    if (getSequenceDictionary() != null) {
        sanityCheckDictionaryAgainstIndex(fastaPath.getRawInputString(), getSequenceDictionary(), index);
    }
}

/**
* Initialise the given indexed fasta sequence file stream.
* @param source The named source of the reference file (used in error messages).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package htsjdk.samtools.reference;

import htsjdk.io.IOPath;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.seekablestream.SeekablePathStream;
Expand Down Expand Up @@ -54,6 +55,32 @@ public BlockCompressedIndexedFastaSequenceFile(final Path path)
this(path, new FastaSequenceIndex((findRequiredFastaIndexFile(path))));
}

/**
* Create a BlockCompressedIndexedFastaSequenceFile from explicitly provided files. No assumptions are made
* about the relative location of the files (i.e., no assumption is made that they are siblings).
* @param fastaPath the fasta file
* @param dictPath the associated dictionary file
* @param index the associated index
* @param gziIndex the associated gziIndex
*/
public BlockCompressedIndexedFastaSequenceFile(
final IOPath fastaPath,
final IOPath dictPath,
final FastaSequenceIndex index,
final GZIIndex gziIndex) {
super(fastaPath, dictPath, index);
if (gziIndex == null) {
throw new IllegalArgumentException("null gzi index");
}
assertIsBlockCompressed(fastaPath.toPath());
try {
stream = new BlockCompressedInputStream(new SeekablePathStream(fastaPath.toPath()));
gzindex = gziIndex;
} catch (IOException e) {
throw new SAMException("Fasta file should be readable but is not: " + fastaPath, e);
}
}

public BlockCompressedIndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index) {
this(path, index, loadFastaGziIndex(path));
}
Expand Down
16 changes: 16 additions & 0 deletions src/main/java/htsjdk/samtools/reference/FastaSequenceFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package htsjdk.samtools.reference;

import htsjdk.io.IOPath;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
Expand Down Expand Up @@ -64,6 +65,21 @@ public FastaSequenceFile(final Path path, final boolean truncateNamesAtWhitespac
this.in = new FastLineReader(IOUtil.openFileForReading(path));
}

/**
 * Constructs a FastaSequenceFile that reads from the specified fasta and dictionary file. The fasta
 * and dictionary files are not assumed to be in the same directory.
 *
 * @param fastaPath the fasta to read; may not be null
 * @param dictPath the sequence dictionary for the fasta; may be null, in which case no dictionary is loaded
 * @param truncateNamesAtWhitespace value stored in this reader's {@code truncateNamesAtWhitespace} field
 */
public FastaSequenceFile(final IOPath fastaPath, final IOPath dictPath, final boolean truncateNamesAtWhitespace) {
    super(fastaPath.toPath(), fastaPath.toString(), dictPath != null ? loadSequenceDictionary(dictPath) : null);
    // this constructor reads through a line reader only, so no seekable stream is kept
    this.seekableStream = null;
    this.truncateNamesAtWhitespace = truncateNamesAtWhitespace;
    this.in = new FastLineReader(IOUtil.openFileForReading(fastaPath.toPath()));
}

/**
* Constructs a FastaSequenceFile that reads from the specified stream (which must not be compressed, i.e.
* the caller is responsible for decompressing the stream).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,16 @@

package htsjdk.samtools.reference;

import htsjdk.io.IOPath;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.seekablestream.ReadableSeekableStreamByteChannel;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.IOUtil;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.SeekableByteChannel;
Expand Down Expand Up @@ -89,6 +87,28 @@ public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index)
}
}

/**
*/
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
*
* @param path The file to open.
* @param dictPath the dictionar path (may be null)
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. may not be null.
*/
public IndexedFastaSequenceFile(final IOPath path, final IOPath dictPath, final FastaSequenceIndex index) {
super(path, dictPath, index);
try {
// reject block-compressed files (use BlockCompressedIndexedFastaSequenceFile)
if (IOUtil.isBlockCompressed(path.toPath(), true)) {
throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path);
}
this.channel = Files.newByteChannel(path.toPath());
} catch (IOException e) {
throw new SAMException("FASTA file should be readable but is not: " + path, e);
}
}

/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param path The file to open.
Expand Down
Loading

0 comments on commit 11db598

Please sign in to comment.