Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-1502] Preserve contig ordering in TwoBitFile sequence dictionary. #1508

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,20 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
private[util] val numSeq = readHeader()
// hold current byte position of start of current index record
var indexRecordStart = TwoBitFile.FILE_INDEX_OFFSET
private val seqRecordStarts = (0 until numSeq).map(i => {
val tup = readIndexEntry(indexRecordStart)
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + tup._1.length + TwoBitFile.OFFSET_SIZE
tup
}).toMap
private[util] val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2))
private[util] val seqRecords = (0 until numSeq).map(idx => {
val (name, startIdx) = readIndexEntry(indexRecordStart)
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + name.length + TwoBitFile.OFFSET_SIZE
(name, TwoBitRecord(bytes, name, startIdx, idx))
})
private val seqRecordsMap = seqRecords.toMap

/**
* The sequence dictionary corresponding to the contigs in this two bit file.
*/
val sequences = new SequenceDictionary(seqRecords.toVector.map(r => SequenceRecord(r._1, r._2.dnaSize)))
val sequences = new SequenceDictionary(seqRecords.toVector
.map(r => SequenceRecord(r._1,
r._2.dnaSize,
referenceIndex = Some(r._2.seqIdx))))

private def readHeader(): Int = {
// figure out proper byte order
Expand Down Expand Up @@ -107,10 +110,10 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
*/
def extract(region: ReferenceRegion, mask: Boolean): String = {
val record =
seqRecords.getOrElse(
seqRecordsMap.getOrElse(
region.referenceName,
throw new Exception(
s"Contig ${region.referenceName} not found in reference map with keys: ${seqRecords.keys.toList.sortBy(x => x).mkString(", ")}"
s"Contig ${region.referenceName} not found in reference map with keys: ${seqRecordsMap.keys.toList.sortBy(x => x).mkString(", ")}"
)
)
val contigLength = record.dnaSize
Expand Down Expand Up @@ -203,7 +206,7 @@ class TwoBitFileSerializer extends Serializer[TwoBitFile] {
}

private object TwoBitRecord {
def apply(twoBitBytes: ByteBuffer, name: String, seqRecordStart: Int): TwoBitRecord = {
def apply(twoBitBytes: ByteBuffer, name: String, seqRecordStart: Int, seqIdx: Int): TwoBitRecord = {
val dnaSize = twoBitBytes.getInt(seqRecordStart)
val nBlockCount = twoBitBytes.getInt(seqRecordStart + TwoBitFile.DNA_SIZE_SIZE)
val nBlockArraysOffset = seqRecordStart + TwoBitFile.DNA_SIZE_SIZE + TwoBitFile.BLOCK_COUNT_SIZE
Expand All @@ -222,12 +225,13 @@ private object TwoBitRecord {
ReferenceRegion(name, maskBlockStart, maskBlockStart + maskBlockSize) -> None
})))
val dnaOffset = maskBlockArraysOffset + (maskBlockCount * TwoBitFile.PER_BLOCK_SIZE) + TwoBitFile.SEQ_RECORD_RESERVED_SIZE
TwoBitRecord(dnaSize, nBlocks, maskBlocks, dnaOffset)
TwoBitRecord(dnaSize, nBlocks, maskBlocks, dnaOffset, seqIdx)
}
}

private case class TwoBitRecord(dnaSize: Int,
nBlocks: Option[NonoverlappingRegions],
maskBlocks: Option[NonoverlappingRegions],
dnaOffset: Int) {
dnaOffset: Int,
seqIdx: Int) {
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class TwoBitFileSuite extends ADAMFunSuite {
val byteAccess = new LocalFileByteAccess(file)
val twoBitFile = new TwoBitFile(byteAccess)
assert(twoBitFile.numSeq == 1)
assert(twoBitFile.seqRecords.toSeq.length == 1)
assert(twoBitFile.seqRecords.size == 1)
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 0, 10)) == "GATCACAGGT")
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 503, 513)) == "CATCCTACCC")
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 16561, 16571)) == "CATCACGATG")
Expand Down Expand Up @@ -55,5 +55,7 @@ class TwoBitFileSuite extends ADAMFunSuite {
val dict = twoBitFile.sequences
assert(dict.records.length == 1)
assert(dict.records.head.length == 16571)
assert(dict.records.head.referenceIndex.isDefined)
assert(dict.records.head.referenceIndex.get === 0)
}
}