Skip to content

Commit

Permalink
refactored ReferenceFile to require SequenceDictionary
Browse files Browse the repository at this point in the history
  • Loading branch information
akmorrow13 authored and fnothaft committed Aug 5, 2016
1 parent 2544a3d commit e7e1adf
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import org.bdgenomics.adam.models.{
SequenceDictionary
}
import org.bdgenomics.adam.rdd.{ AvroGenomicRDD, JavaSaveArgs }
import org.bdgenomics.adam.util.ReferenceFile
import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment }
import scala.collection.JavaConversions._
import scala.math.max
Expand All @@ -53,9 +54,18 @@ object NucleotideContigFragmentRDD extends Serializable {
}
}

/**
* A wrapper class for RDD[NucleotideContigFragment].
* NucleotideContigFragmentRDD extends ReferenceFile. To specifically access a ReferenceFile within an RDD,
* refer to:
* @see ReferenceContigMap
*
* @param rdd Underlying RDD
* @param sequences Sequence dictionary computed from rdd
*/
case class NucleotideContigFragmentRDD(
rdd: RDD[NucleotideContigFragment],
sequences: SequenceDictionary) extends AvroGenomicRDD[NucleotideContigFragment, NucleotideContigFragmentRDD] {
sequences: SequenceDictionary) extends AvroGenomicRDD[NucleotideContigFragment, NucleotideContigFragmentRDD] with ReferenceFile {

/**
* Converts an RDD of nucleotide contig fragments into reads. Adjacent contig fragments are
Expand Down Expand Up @@ -169,7 +179,7 @@ case class NucleotideContigFragmentRDD(
* @param region Reference region over which to get sequence.
* @return String of bases corresponding to reference sequence.
*/
def getReferenceString(region: ReferenceRegion): String = {
def extract(region: ReferenceRegion): String = {
def getString(fragment: (ReferenceRegion, NucleotideContigFragment)): (ReferenceRegion, String) = {
val trimStart = max(0, region.start - fragment._1.start).toInt
val trimEnd = max(0, fragment._1.end - region.end).toInt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@ package org.bdgenomics.adam.util
import org.apache.spark.rdd.RDD
// NOTE(ryan): this is necessary for Spark <= 1.2.1.
import org.apache.spark.SparkContext._
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary, SequenceRecord }
import org.bdgenomics.formats.avro.NucleotideContigFragment

case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragment]]) extends ReferenceFile {

// create sequence dictionary
val sequences: SequenceDictionary = new SequenceDictionary(contigMap.map(r =>
SequenceRecord(r._1, r._2.map(_.getFragmentEndPosition).max)).toVector)

/**
* Extract reference sequence from the file.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

package org.bdgenomics.adam.util

import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }

/**
* File that contains a reference assembly that can be broadcasted
Expand All @@ -31,4 +31,10 @@ trait ReferenceFile extends Serializable {
* @return The reference sequence at the desired locus.
*/
def extract(region: ReferenceRegion): String

/*
* Stores SequenceDictionary for ReferenceFile
*/
def sequences: SequenceDictionary

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import java.nio.{ ByteOrder, ByteBuffer }
import com.esotericsoftware.kryo.io.{ Output, Input }
import com.esotericsoftware.kryo.{ Kryo, Serializer }
import org.bdgenomics.utils.io.{ ByteArrayByteAccess, ByteAccess }
import org.bdgenomics.adam.models.{ NonoverlappingRegions, ReferencePosition, ReferenceRegion }
import org.bdgenomics.adam.models._

object TwoBitFile {
val MAGIC_NUMBER: Int = 0x1A412743
Expand Down Expand Up @@ -67,7 +67,10 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + tup._1.length + TwoBitFile.OFFSET_SIZE
tup
}).toMap
val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2)).toMap
val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2))

// create sequence dictionary
val sequences = new SequenceDictionary(seqRecords.toVector.map(r => SequenceRecord(r._1, r._2.dnaSize)))

private def readHeader(): Int = {
// figure out proper byte order
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
*/
package org.bdgenomics.adam.rdd.contig

import com.google.common.io.Files
import java.io.File
import org.apache.spark.rdd.RDD

import com.google.common.io.Files
import org.bdgenomics.adam.models._
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.util.ADAMFunSuite
import org.bdgenomics.formats.avro._

import scala.collection.mutable.ListBuffer

class NucleotideContigFragmentRDDSuite extends ADAMFunSuite {
Expand Down Expand Up @@ -76,7 +76,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite {

val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment)))

assert(rdd.getReferenceString(region) === "ACTGTAC")
assert(rdd.extract(region) === "ACTGTAC")
}

sparkTest("recover trimmed reference string from a single contig fragment") {
Expand All @@ -97,7 +97,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite {

val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment)))

assert(rdd.getReferenceString(region) === "CTGTA")
assert(rdd.extract(region) === "CTGTA")
}

sparkTest("recover reference string from multiple contig fragments") {
Expand Down Expand Up @@ -143,8 +143,8 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite {
fragment1,
fragment2)))

assert(rdd.getReferenceString(region0) === "ACTGTAC")
assert(rdd.getReferenceString(region1) === "GTACTCTCATG")
assert(rdd.extract(region0) === "ACTGTAC")
assert(rdd.extract(region1) === "GTACTCTCATG")
}

sparkTest("recover trimmed reference string from multiple contig fragments") {
Expand Down Expand Up @@ -190,8 +190,8 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite {
fragment1,
fragment2)))

assert(rdd.getReferenceString(region0) === "CTGTA")
assert(rdd.getReferenceString(region1) === "CTCTCA")
assert(rdd.extract(region0) === "CTGTA")
assert(rdd.extract(region1) === "CTCTCA")
}

sparkTest("testing nondeterminism from reduce when recovering referencestring") {
Expand All @@ -213,7 +213,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite {
var passed = true
val rdd = NucleotideContigFragmentRDD(sc.parallelize(fragments.toList))
try {
val result = rdd.getReferenceString(new ReferenceRegion("chr1", 0L, 1000L))
val result = rdd.extract(new ReferenceRegion("chr1", 0L, 1000L))
} catch {
case e: AssertionError => passed = false
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
package org.bdgenomics.adam.rdd.read

import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.util.{ ReferenceContigMap, ADAMFunSuite }
import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment, Contig }
import org.bdgenomics.utils.misc.SparkFunSuite
import org.bdgenomics.adam.util.{ ADAMFunSuite, ReferenceContigMap }
import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig, NucleotideContigFragment }

class MDTaggingSuite extends ADAMFunSuite {
val chr1 =
Expand All @@ -34,7 +33,13 @@ class MDTaggingSuite extends ADAMFunSuite {
sc.parallelize(
for {
(contig, start, seq) <- frags
} yield NucleotideContigFragment.newBuilder().setContig(contig).setFragmentStartPosition(start.toLong).setFragmentSequence(seq).build()
} yield (
NucleotideContigFragment.newBuilder()
.setContig(contig)
.setFragmentStartPosition(start.toLong)
.setFragmentEndPosition(start.toLong + seq.length)
.setFragmentSequence(seq).build()
)
)

def makeReads(reads: ((Contig, Int, Int, String, String), String)*): (Map[Int, String], RDD[AlignmentRecord]) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ package org.bdgenomics.adam.util

import java.io.File

import org.bdgenomics.utils.io.LocalFileByteAccess
import org.bdgenomics.adam.models.ReferenceRegion
import org.scalatest.FunSuite
import org.bdgenomics.utils.io.LocalFileByteAccess

class TwoBitSuite extends ADAMFunSuite {
test("correctly read sequence from .2bit file") {
Expand Down Expand Up @@ -49,4 +48,13 @@ class TwoBitSuite extends ADAMFunSuite {
val twoBitFile = new TwoBitFile(byteAccess)
assert(twoBitFile.extract(ReferenceRegion("1", 9990, 10010), true) == "NNNNNNNNNNTAACCCTAAC")
}

test("correctly calculates sequence dictionary") {
val file = new File(resourcePath("hg19.chrM.2bit"))
val byteAccess = new LocalFileByteAccess(file)
val twoBitFile = new TwoBitFile(byteAccess)
val dict = twoBitFile.sequences
assert(dict.records.length == 1)
assert(dict.records.head.length == 16571)
}
}

0 comments on commit e7e1adf

Please sign in to comment.