Skip to content

Commit

Permalink
[ADAM-1502] Preserve contig ordering in TwoBitFile sequence dictionary.
Browse files Browse the repository at this point in the history
Resolves #1502.
  • Loading branch information
fnothaft authored and heuermh committed May 5, 2017
1 parent dbe5c97 commit 36d8e0b
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 13 deletions.
28 changes: 16 additions & 12 deletions adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,20 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
private[util] val numSeq = readHeader()
// hold current byte position of start of current index record
var indexRecordStart = TwoBitFile.FILE_INDEX_OFFSET
private val seqRecordStarts = (0 until numSeq).map(i => {
val tup = readIndexEntry(indexRecordStart)
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + tup._1.length + TwoBitFile.OFFSET_SIZE
tup
}).toMap
private[util] val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2))
private[util] val seqRecords = (0 until numSeq).map(idx => {
val (name, startIdx) = readIndexEntry(indexRecordStart)
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + name.length + TwoBitFile.OFFSET_SIZE
(name, TwoBitRecord(bytes, name, startIdx, idx))
})
private val seqRecordsMap = seqRecords.toMap

/**
* The sequence dictionary corresponding to the contigs in this two bit file.
*/
val sequences = new SequenceDictionary(seqRecords.toVector.map(r => SequenceRecord(r._1, r._2.dnaSize)))
val sequences = new SequenceDictionary(seqRecords.toVector
.map(r => SequenceRecord(r._1,
r._2.dnaSize,
referenceIndex = Some(r._2.seqIdx))))

private def readHeader(): Int = {
// figure out proper byte order
Expand Down Expand Up @@ -107,10 +110,10 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
*/
def extract(region: ReferenceRegion, mask: Boolean): String = {
val record =
seqRecords.getOrElse(
seqRecordsMap.getOrElse(
region.referenceName,
throw new Exception(
s"Contig ${region.referenceName} not found in reference map with keys: ${seqRecords.keys.toList.sortBy(x => x).mkString(", ")}"
s"Contig ${region.referenceName} not found in reference map with keys: ${seqRecordsMap.keys.toList.sortBy(x => x).mkString(", ")}"
)
)
val contigLength = record.dnaSize
Expand Down Expand Up @@ -203,7 +206,7 @@ class TwoBitFileSerializer extends Serializer[TwoBitFile] {
}

private object TwoBitRecord {
def apply(twoBitBytes: ByteBuffer, name: String, seqRecordStart: Int): TwoBitRecord = {
def apply(twoBitBytes: ByteBuffer, name: String, seqRecordStart: Int, seqIdx: Int): TwoBitRecord = {
val dnaSize = twoBitBytes.getInt(seqRecordStart)
val nBlockCount = twoBitBytes.getInt(seqRecordStart + TwoBitFile.DNA_SIZE_SIZE)
val nBlockArraysOffset = seqRecordStart + TwoBitFile.DNA_SIZE_SIZE + TwoBitFile.BLOCK_COUNT_SIZE
Expand All @@ -222,12 +225,13 @@ private object TwoBitRecord {
ReferenceRegion(name, maskBlockStart, maskBlockStart + maskBlockSize) -> None
})))
val dnaOffset = maskBlockArraysOffset + (maskBlockCount * TwoBitFile.PER_BLOCK_SIZE) + TwoBitFile.SEQ_RECORD_RESERVED_SIZE
TwoBitRecord(dnaSize, nBlocks, maskBlocks, dnaOffset)
TwoBitRecord(dnaSize, nBlocks, maskBlocks, dnaOffset, seqIdx)
}
}

private case class TwoBitRecord(dnaSize: Int,
nBlocks: Option[NonoverlappingRegions],
maskBlocks: Option[NonoverlappingRegions],
dnaOffset: Int) {
dnaOffset: Int,
seqIdx: Int) {
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class TwoBitFileSuite extends ADAMFunSuite {
val byteAccess = new LocalFileByteAccess(file)
val twoBitFile = new TwoBitFile(byteAccess)
assert(twoBitFile.numSeq == 1)
assert(twoBitFile.seqRecords.toSeq.length == 1)
assert(twoBitFile.seqRecords.size == 1)
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 0, 10)) == "GATCACAGGT")
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 503, 513)) == "CATCCTACCC")
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 16561, 16571)) == "CATCACGATG")
Expand Down Expand Up @@ -55,5 +55,7 @@ class TwoBitFileSuite extends ADAMFunSuite {
val dict = twoBitFile.sequences
assert(dict.records.length == 1)
assert(dict.records.head.length == 16571)
assert(dict.records.head.referenceIndex.isDefined)
assert(dict.records.head.referenceIndex.get === 0)
}
}

0 comments on commit 36d8e0b

Please sign in to comment.