From 9a318ba40527ad660dd7637d8a7d5536adf14c7e Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Tue, 17 Mar 2015 19:50:12 -0700 Subject: [PATCH] [ADAM-601] Consolidate ReferencePosition so that it extends ReferenceRegion. Consolidates ordered classes down into unordered classes. --- .gitignore | 2 + .../adam/models/ReferencePosition.scala | 231 ++---------------- .../adam/models/ReferencePositionPair.scala | 31 ++- .../adam/models/ReferenceRegion.scala | 154 +++++------- .../adam/rdd/ShuffleRegionJoin.scala | 2 +- ...NucleotideContigFragmentRDDFunctions.scala | 4 +- .../read/AlignmentRecordRDDFunctions.scala | 11 +- .../adam/rdd/read/MarkDuplicates.scala | 16 +- .../realignment/IndelRealignmentTarget.scala | 4 +- .../rdd/read/realignment/RealignIndels.scala | 3 +- .../adam/rich/RichAlignmentRecord.scala | 17 +- .../serialization/ADAMKryoRegistrator.scala | 1 - .../adam/models/ReferencePositionSuite.scala | 136 +---------- .../adam/models/ReferenceRegionSuite.scala | 71 +----- .../adam/rdd/read/MarkDuplicatesSuite.scala | 52 ++-- .../BaseQualityRecalibrationSuite.scala | 8 +- 16 files changed, 187 insertions(+), 556 deletions(-) diff --git a/.gitignore b/.gitignore index 910e7c97a8..c08b87429c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ build *.log .*.swp .DS_Store + +*#* \ No newline at end of file diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala index 9ac1b6fd3f..f72826c5cd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala @@ -17,194 +17,44 @@ */ package org.bdgenomics.adam.models -import org.bdgenomics.formats.avro._ -import org.bdgenomics.adam.rdd.ADAMContext._ import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Input, Output } -import scala.annotation.tailrec - -object ReferencePositionWithOrientation { - - def apply(record: AlignmentRecord): Option[ReferencePositionWithOrientation] = { - if (record.getReadMapped) { - Some(new ReferencePositionWithOrientation(ReferencePosition(record).get, record.getReadNegativeStrand)) - } else { - None - } - } - - def fivePrime(record: AlignmentRecord): Option[ReferencePositionWithOrientation] = { - if (record.getReadMapped) { - Some(new ReferencePositionWithOrientation(ReferencePosition.fivePrime(record).get, record.getReadNegativeStrand)) - } else { - None - } - } - /** - * Given an index into a sequence (e.g., a transcript), and a sequence - * of regions that map this sequence to a reference genome, returns a - * reference position. - * - * @param idx Index into a sequence. - * @param mapping Sequence of reference regions that maps the sequence to a - * reference genome. - * @return Returns a lifted over reference position. - * - * @throws AssertionError Throws an assertion error if the strandedness does - * not match between the reference position and the mapping regions. - * @throws IllegalArgumentException Throws an exception if the index given - * is out of the range of the mappings given. - */ - def liftOverToReference(idx: Long, - mapping: Seq[ReferenceRegionWithOrientation]): ReferencePositionWithOrientation = { - val negativeStrand = mapping.head.negativeStrand - - @tailrec def liftOverHelper(regions: Iterator[ReferenceRegionWithOrientation], - idx: Long): ReferencePositionWithOrientation = { - if (!regions.hasNext) { - throw new IllegalArgumentException("Liftover is out of range") - } else { - val reg = regions.next() - assert(reg.negativeStrand == negativeStrand, - "Strand is inconsistent across set of regions.") - - // is our position in this region? - if (reg.width > idx) { - // get contig - val chr = reg.referenceName - - // get our pos - val pos = if (negativeStrand) { - reg.end - idx - 1 // need -1 because of open end coordinate - } else { - reg.start + idx - } - - // return - ReferencePositionWithOrientation(ReferencePosition(chr, pos), - negativeStrand) - } else { - // recurse - liftOverHelper(regions, idx - reg.width) - } - } - } +import org.bdgenomics.formats.avro._ - // call out to helper - if (negativeStrand) { - liftOverHelper(mapping.reverseIterator, idx) - } else { - liftOverHelper(mapping.toIterator, idx) - } - } +object PositionOrdering extends ReferenceOrdering[ReferencePosition] { } - -case class ReferencePositionWithOrientation(refPos: ReferencePosition, - negativeStrand: Boolean) - extends Ordered[ReferencePositionWithOrientation] { - - /** - * Given a sequence of regions that map from a reference genome to a sequence, - * returns an index into a sequence. - * - * @param mapping Sequence of reference regions that maps the sequence to a - * reference genome. - * @return Returns an index into a sequence. - * - * @throws AssertionError Throws an assertion error if the strandedness does - * not match between the reference position and the mapping regions. - * @throws IllegalArgumentException Throws an exception if the index given - * is out of the range of the mappings given. - */ - def liftOverFromReference(mapping: Seq[ReferenceRegionWithOrientation]): Long = { - val pos = refPos.pos - - @tailrec def liftOverHelper(regions: Iterator[ReferenceRegionWithOrientation], - idx: Long = 0L): Long = { - if (!regions.hasNext) { - throw new IllegalArgumentException("Position was not contained in mapping.") - } else { - // get region - val reg = regions.next() - assert(reg.negativeStrand == negativeStrand, - "Strand is inconsistent across set of regions.") - - // are we in this region? - if (reg.contains(this)) { - // how far are we from the end of the region - val into = if (negativeStrand) { - reg.end - pos - } else { - pos - reg.start - } - - // add to index - idx + into - } else { - liftOverHelper(regions, idx + reg.width) - } - } - } - - // call out to helper - liftOverHelper(mapping.toIterator) - } - - override def compare(that: ReferencePositionWithOrientation): Int = { - val posCompare = refPos.compare(that.refPos) - if (posCompare != 0) { - posCompare - } else { - negativeStrand.compare(that.negativeStrand) - } - } +object OptionalPositionOrdering extends OptionalReferenceOrdering[ReferencePosition] { + val baseOrdering = PositionOrdering } -object ReferencePosition { +object ReferencePosition extends Serializable { + + implicit def orderingForPositions = PositionOrdering + implicit def orderingForOptionalPositions = OptionalPositionOrdering /** * The UNMAPPED value is a convenience value, which can be used to indicate a position * which is not located anywhere along the reference sequences (see, e.g. its use in * GenomicPositionPartitioner). */ - val UNMAPPED = new ReferencePosition("", -1) - - /** - * Checks to see if a read is mapped with a valid position. - * - * @param record Read to check for mapping. - * @return True if read is mapped and has a valid position, else false. - */ - def mappedPositionCheck(record: AlignmentRecord): Boolean = { - val contig = Option(record.getContig) - val start = Option(record.getStart) - record.getReadMapped && (contig.isDefined && Option(contig.get.getContigName).isDefined) && start.isDefined - } + val UNMAPPED = new ReferencePosition("", 0) /** * Generates a reference position from a read. This function generates the * position from the start mapping position of the read. * * @param record Read from which to generate a reference position. - * @return Reference position wrapped inside of an option. If the read is - * mapped, the option will be stuffed, else it will be empty (None). + * @return Reference position of the start position. * * @see fivePrime */ - def apply(record: AlignmentRecord): Option[ReferencePosition] = { - if (mappedPositionCheck(record)) { - Some(new ReferencePosition(record.getContig.getContigName.toString, record.getStart)) - } else { - None - } + def apply(record: AlignmentRecord): ReferencePosition = { + new ReferencePosition(record.getContig.getContigName.toString, record.getStart) } /** * Generates a reference position from a called variant. * - * @note An invariant of variants is that they have a defined position, - * therefore we do not wrap them in an option. - * * @param variant Called variant from which to generate a * reference position. * @return The reference position of this variant. @@ -216,9 +66,6 @@ object ReferencePosition { /** * Generates a reference position from a genotype. * - * @note An invariant of genotypes is that they have a defined position, - * therefore we do not wrap them in an option. - * * @param genotype Genotype from which to generate a reference position. * @return The reference position of this genotype. */ @@ -227,63 +74,33 @@ object ReferencePosition { new ReferencePosition(variant.getContig.getContigName, variant.getStart) } - /** - * Generates a reference position from the five prime end of a read. This - * will be different from the start mapping position of a read if this - * read is a reverse strand read. - * - * @param record Read from which to generate a reference position. - * @return Reference position wrapped inside of an option. If the read is - * mapped, the option will be stuffed, else it will be empty (None). - * - * @see apply(record: Read) - */ - def fivePrime(record: AlignmentRecord): Option[ReferencePosition] = { - if (mappedPositionCheck(record)) { - Some(new ReferencePosition(record.getContig.getContigName, record.fivePrimePosition.get)) - } else { - None - } + def apply(referenceName: String, pos: Long): ReferencePosition = { + new ReferencePosition(referenceName, pos) } -} -case class ReferencePosition(referenceName: String, pos: Long) extends Ordered[ReferencePosition] { - - override def compare(that: ReferencePosition): Int = { - // Note: important to compare by reference first for coordinate ordering - val refCompare = referenceName.compare(that.referenceName) - if (refCompare != 0) { - refCompare - } else { - pos.compare(that.pos) - } + def apply(referenceName: String, pos: Long, orientation: Strand): ReferencePosition = { + new ReferencePosition(referenceName, pos, orientation) } } -class ReferencePositionWithOrientationSerializer extends Serializer[ReferencePositionWithOrientation] { - val referencePositionSerializer = new ReferencePositionSerializer() - - def write(kryo: Kryo, output: Output, obj: ReferencePositionWithOrientation) = { - output.writeBoolean(obj.negativeStrand) - referencePositionSerializer.write(kryo, output, obj.refPos) - } - - def read(kryo: Kryo, input: Input, klazz: Class[ReferencePositionWithOrientation]): ReferencePositionWithOrientation = { - val negativeStrand = input.readBoolean() - val referencePosition = referencePositionSerializer.read(kryo, input, classOf[ReferencePosition]) - new ReferencePositionWithOrientation(referencePosition, negativeStrand) - } +class ReferencePosition(override val referenceName: String, + val pos: Long, + override val orientation: Strand = Strand.Independent) extends ReferenceRegion(referenceName, pos, pos + 1, orientation) { } class ReferencePositionSerializer extends Serializer[ReferencePosition] { + private val enumValues = Strand.values() + def write(kryo: Kryo, output: Output, obj: ReferencePosition) = { output.writeString(obj.referenceName) output.writeLong(obj.pos) + output.writeInt(obj.orientation.ordinal) } def read(kryo: Kryo, input: Input, klazz: Class[ReferencePosition]): ReferencePosition = { val refName = input.readString() val pos = input.readLong() - new ReferencePosition(refName, pos) + val orientation = input.readInt() + new ReferencePosition(refName, pos, enumValues(orientation)) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala index e86bb0d99e..5cd7983578 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala @@ -19,18 +19,25 @@ package org.bdgenomics.adam.models import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Input, Output } -import org.apache.spark.Logging import Ordering.Option +import org.apache.spark.Logging import org.bdgenomics.adam.instrumentation.Timers.CreateReferencePositionPair +import org.bdgenomics.adam.models.ReferenceRegion._ +import org.bdgenomics.adam.rich.RichAlignmentRecord +import org.bdgenomics.formats.avro.AlignmentRecord object ReferencePositionPair extends Logging { + private def posForRead(read: AlignmentRecord): Option[ReferencePosition] = { + RichAlignmentRecord(read).fivePrimeReferencePosition + } + def apply(singleReadBucket: SingleReadBucket): ReferencePositionPair = CreateReferencePositionPair.time { singleReadBucket.primaryMapped.toSeq.lift(0) match { case None => // No mapped reads new ReferencePositionPair(None, None) case Some(read1) => - val read1pos = ReferencePositionWithOrientation.fivePrime(read1) + val read1pos = posForRead(read1) if (read1.getReadPaired && read1.getMateMapped) { singleReadBucket.primaryMapped.toSeq.lift(1) match { case None => @@ -39,8 +46,8 @@ object ReferencePositionPair extends Logging { new ReferencePositionPair(read1pos, None) case Some(read2) => // Both reads are mapped - val read2pos = ReferencePositionWithOrientation.fivePrime(read2) - if (read1pos < read2pos) { + val read2pos = posForRead(read2) + if (read1pos.get.disorient.compareTo(read2pos.get.disorient) < 0) { new ReferencePositionPair(read1pos, read2pos) } else { new ReferencePositionPair(read2pos, read1pos) @@ -52,9 +59,9 @@ object ReferencePositionPair extends Logging { // Mate is not mapped... new ReferencePositionPair(read1pos, None) case Some(read2) => - val read2pos = ReferencePositionWithOrientation.fivePrime(read2) + val read2pos = posForRead(read2) log.warn("%s claimed to not have mate but mate found".format(read1.getReadName)) - if (read1pos < read2pos) { + if (read1pos.get.disorient.compareTo(read2pos.get.disorient) < 0) { new ReferencePositionPair(read1pos, read2pos) } else { new ReferencePositionPair(read2pos, read1pos) @@ -65,13 +72,13 @@ object ReferencePositionPair extends Logging { } } -case class ReferencePositionPair(read1refPos: Option[ReferencePositionWithOrientation], - read2refPos: Option[ReferencePositionWithOrientation]) +case class ReferencePositionPair(read1refPos: Option[ReferencePosition], + read2refPos: Option[ReferencePosition]) class ReferencePositionPairSerializer extends Serializer[ReferencePositionPair] { - val rps = new ReferencePositionWithOrientationSerializer() + val rps = new ReferencePositionSerializer() - def writeOptionalReferencePos(kryo: Kryo, output: Output, optRefPos: Option[ReferencePositionWithOrientation]) = { + def writeOptionalReferencePos(kryo: Kryo, output: Output, optRefPos: Option[ReferencePosition]) = { optRefPos match { case None => output.writeBoolean(false) @@ -81,10 +88,10 @@ class ReferencePositionPairSerializer extends Serializer[ReferencePositionPair] } } - def readOptionalReferencePos(kryo: Kryo, input: Input): Option[ReferencePositionWithOrientation] = { + def readOptionalReferencePos(kryo: Kryo, input: Input): Option[ReferencePosition] = { val exists = input.readBoolean() if (exists) { - Some(rps.read(kryo, input, classOf[ReferencePositionWithOrientation])) + Some(rps.read(kryo, input, classOf[ReferencePosition])) } else { None } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala index 86bd7d036e..e6ffa2ffb4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala @@ -20,73 +20,55 @@ package org.bdgenomics.adam.models import com.esotericsoftware.kryo.io.{ Input, Output } import com.esotericsoftware.kryo.{ Kryo, Serializer } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.{ Feature, AlignmentRecord, NucleotideContigFragment } +import org.bdgenomics.formats.avro._ import scala.math.{ max, min } -object ReferenceRegionWithOrientation { - - /** - * Builds an oriented reference region from the individual parameters - * - * @param referenceName The name of the sequence (chromosome) in the reference genome - * @param start The 0-based residue-coordinate for the start of the region - * @param end The 0-based residue-coordinate for the first residue after the start - * which is not in the region -- i.e. [start, end) define a 0-based - * half-open interval. - * @param negativeStrand Boolean flag as to whether the region is on the forward or - * reverse strand of the reference region. - */ - def apply(referenceName: String, - start: Long, - end: Long, - negativeStrand: Boolean): ReferenceRegionWithOrientation = { - ReferenceRegionWithOrientation(ReferenceRegion(referenceName, start, end), negativeStrand) - } -} - -/** - * Represents a contiguous region of the reference genome with strand information. - * - * @param region The genomic locus as a ReferenceRegion - * @param negativeStrand Boolean flag as to whether the region is on the forward or - * reverse strand of the reference region. - */ -case class ReferenceRegionWithOrientation(region: ReferenceRegion, - negativeStrand: Boolean) extends Ordered[ReferenceRegionWithOrientation] { - def width: Long = region.width - - def contains(other: ReferencePositionWithOrientation): Boolean = { - negativeStrand == other.negativeStrand && region.contains(other.refPos) - } - - def contains(other: ReferenceRegionWithOrientation): Boolean = { - region.contains(other.region) && negativeStrand == other.negativeStrand - } - - def overlaps(other: ReferenceRegionWithOrientation): Boolean = { - region.overlaps(other.region) && negativeStrand == other.negativeStrand +trait ReferenceOrdering[T <: ReferenceRegion] extends Ordering[T] { + private def regionCompare(a: T, + b: T): Int = { + if (a.referenceName != b.referenceName) { + a.referenceName.compareTo(b.referenceName) + } else if (a.start != b.start) { + a.start.compareTo(b.start) + } else { + a.end.compareTo(b.end) + } } - def compare(that: ReferenceRegionWithOrientation): Int = { - val regionCompare = region.compare(that.region) - if (regionCompare != 0) { - regionCompare + def compare(a: T, + b: T): Int = { + val rc = regionCompare(a, b) + if (rc == 0) { + a.orientation.ordinal compare b.orientation.ordinal } else { - negativeStrand.compare(that.negativeStrand) + rc } } +} - def toReferenceRegion: ReferenceRegion = region - - def referenceName: String = region.referenceName +trait OptionalReferenceOrdering[T <: ReferenceRegion] extends Ordering[Option[T]] { + val baseOrdering: ReferenceOrdering[T] - def start: Long = region.start + def compare(a: Option[T], + b: Option[T]): Int = (a, b) match { + case (None, None) => 0 + case (Some(pa), Some(pb)) => baseOrdering.compare(pa, pb) + case (Some(pa), None) => -1 + case (None, Some(pb)) => -1 + } +} - def end: Long = region.end +object RegionOrdering extends ReferenceOrdering[ReferenceRegion] { +} +object OptionalRegionOrdering extends OptionalReferenceOrdering[ReferenceRegion] { + val baseOrdering = RegionOrdering } object ReferenceRegion { + implicit def orderingForPositions = RegionOrdering + implicit def orderingForOptionalPositions = OptionalRegionOrdering + /** * Generates a reference region from read data. Returns None if the read is not mapped; * else, returns the inclusive region from the start to the end of the read alignment. @@ -144,13 +126,15 @@ object ReferenceRegion { * which is not in the region -- i.e. [start, end) define a 0-based * half-open interval. */ -case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Ordered[ReferenceRegion] with Interval { +case class ReferenceRegion(referenceName: String, start: Long, end: Long, orientation: Strand = Strand.Independent) extends Comparable[ReferenceRegion] with Interval { assert(start >= 0) assert(end >= start) def width: Long = end - start + def disorient: ReferenceRegion = new ReferenceRegion(referenceName, start, end) + /** * Merges two reference regions that are contiguous. * @@ -189,6 +173,7 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend * @see merge */ def hull(region: ReferenceRegion): ReferenceRegion = { + assert(orientation == region.orientation, "Cannot compute convex hull of differently oriented regions.") assert(referenceName == region.referenceName, "Cannot compute convex hull of regions on different references.") ReferenceRegion(referenceName, min(start, region.start), max(end, region.end)) } @@ -202,28 +187,6 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend def isAdjacent(region: ReferenceRegion): Boolean = distance(region).map(_ == 1).getOrElse(false) - /** - * Returns the distance between this reference region and a point in the reference space. - * - * @note Distance here is defined as the minimum distance between any point within this region, and - * the point we are measuring against. If the point is within this region, its distance will be 0. - * Else, the distance will be greater than or equal to 1. - * - * @param other Point to compare against. - * @return Returns an option containing the distance between two points. If the point is not in - * our reference space, we return an empty option (None). - */ - def distance(other: ReferencePosition): Option[Long] = - if (referenceName == other.referenceName) - if (other.pos < start) - Some(start - other.pos) - else if (other.pos >= end) - Some(other.pos - end + 1) - else - Some(0) - else - None - /** * Returns the distance between this reference region and another region in the reference space. * @@ -236,7 +199,7 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend * our reference space, we return an empty option (None). */ def distance(other: ReferenceRegion): Option[Long] = - if (referenceName == other.referenceName) + if (referenceName == other.referenceName && orientation == other.orientation) if (overlaps(other)) Some(0) else if (other.start >= end) @@ -246,22 +209,27 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend else None - def contains(other: ReferencePosition): Boolean = - referenceName == other.referenceName && start <= other.pos && end > other.pos + def contains(other: ReferencePosition): Boolean = { + orientation == other.orientation && + referenceName == other.referenceName && + start <= other.pos && end > other.pos + } - def contains(other: ReferenceRegion): Boolean = - referenceName == other.referenceName && start <= other.start && end >= other.end + def contains(other: ReferenceRegion): Boolean = { + orientation == other.orientation && + referenceName == other.referenceName && + start <= other.start && end >= other.end + } - def overlaps(other: ReferenceRegion): Boolean = - referenceName == other.referenceName && end > other.start && start < other.end + def overlaps(other: ReferenceRegion): Boolean = { + orientation == other.orientation && + referenceName == other.referenceName && + end > other.start && start < other.end + } - def compare(that: ReferenceRegion): Int = - if (referenceName != that.referenceName) - referenceName.compareTo(that.referenceName) - else if (start != that.start) - start.compareTo(that.start) - else - end.compareTo(that.end) + def compareTo(that: ReferenceRegion): Int = { + RegionOrdering.compare(this, that) + } def length(): Long = { end - start @@ -269,16 +237,20 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend } class ReferenceRegionSerializer extends Serializer[ReferenceRegion] { + private val enumValues = Strand.values() + def write(kryo: Kryo, output: Output, obj: ReferenceRegion) = { output.writeString(obj.referenceName) output.writeLong(obj.start) output.writeLong(obj.end) + output.writeInt(obj.orientation.ordinal()) } def read(kryo: Kryo, input: Input, klazz: Class[ReferenceRegion]): ReferenceRegion = { val referenceName = input.readString() val start = input.readLong() val end = input.readLong() - new ReferenceRegion(referenceName, start, end) + val orientation = input.readInt() + new ReferenceRegion(referenceName, start, end, enumValues(orientation)) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala index f7ca6a3688..6d39956dc1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.bdgenomics.adam.rdd import org.apache.spark.{ Logging, Partitioner, SparkContext } import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.models.{ SequenceDictionary, SequenceRecord, ReferenceRegion } import org.bdgenomics.adam.rdd.ADAMContext._ import scala.collection.mutable.ListBuffer diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala index 86976d9421..198c129196 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala @@ -78,7 +78,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " + kv1 + ", " + kv2) - (kv1._1.merge(kv2._1), if (kv1._1.compare(kv2._1) <= 0) { + (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) { kv1._2 + kv2._2 } else { kv2._2 + kv1._2 @@ -94,7 +94,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e .map(kv => getString(kv)) .reduce(reducePairs) - assert(pair._1.compare(region) == 0, + assert(pair._1.compareTo(region) == 0, "Merging fragments returned a different region than requested.") pair._2 diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala index e4bbe2ad35..8e0f6334b6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala @@ -32,6 +32,7 @@ import org.bdgenomics.adam.algorithms.consensus.{ import org.bdgenomics.adam.converters.AlignmentRecordConverter import org.bdgenomics.adam.instrumentation.Timers._ import org.bdgenomics.adam.models._ +import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.{ ADAMSaveArgs, ADAMSaveAnyArgs, ADAMSequenceDictionaryRDDAggregator } import org.bdgenomics.adam.rdd.read.correction.{ ErrorCorrection, TrimReads } @@ -266,12 +267,10 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) } rdd.map(p => { - val referencePos = ReferencePosition(p) match { - case None => - // Move unmapped reads to the end of the file - ReferencePosition( - unmappedReferenceNames.next(), Long.MaxValue) - case Some(pos) => pos + val referencePos = if (p.getReadMapped) { + ReferencePosition(p) + } else { + ReferencePosition(unmappedReferenceNames.next(), 0) } (referencePos, p) }).sortByKey().map(p => p._2) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala index 3af8565448..c4afada9c7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala @@ -19,7 +19,11 @@ package org.bdgenomics.adam.rdd.read import org.apache.spark.rdd.RDD import org.bdgenomics.adam.instrumentation.Timers._ -import org.bdgenomics.adam.models.{ ReferencePositionPair, ReferencePositionWithOrientation, SingleReadBucket } +import org.bdgenomics.adam.models.{ + ReferencePosition, + ReferencePositionPair, + SingleReadBucket +} import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.formats.avro.AlignmentRecord @@ -62,19 +66,21 @@ private[rdd] object MarkDuplicates extends Serializable { def apply(rdd: RDD[AlignmentRecord]): RDD[AlignmentRecord] = { // Group by library and left position - def leftPositionAndLibrary(p: (ReferencePositionPair, SingleReadBucket)): (Option[ReferencePositionWithOrientation], String) = { + def leftPositionAndLibrary(p: (ReferencePositionPair, SingleReadBucket)): (Option[ReferencePosition], String) = { (p._1.read1refPos, p._2.allReads.head.getRecordGroupLibrary) } // Group by right position - def rightPosition(p: (ReferencePositionPair, SingleReadBucket)): Option[ReferencePositionWithOrientation] = { + def rightPosition(p: (ReferencePositionPair, SingleReadBucket)): Option[ReferencePosition] = { p._1.read2refPos } - rdd.adamSingleReadBuckets().keyBy(ReferencePositionPair(_)).groupBy(leftPositionAndLibrary) + rdd.adamSingleReadBuckets() + .keyBy(ReferencePositionPair(_)) + .groupBy(leftPositionAndLibrary) .flatMap(kv => PerformDuplicateMarking.time { - val leftPos: Option[ReferencePositionWithOrientation] = kv._1._1 + val leftPos: Option[ReferencePosition] = kv._1._1 val readsAtLeftPos: Iterable[(ReferencePositionPair, SingleReadBucket)] = kv._2 leftPos match { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala index f916bbe4e3..db94c35f8a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala @@ -51,7 +51,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] { * @param b Indel realignment target to compare. * @return Comparison done by starting position. */ - def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = a.readRange compare b.readRange + def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = a.readRange compareTo b.readRange /** * Check to see if an indel realignment target contains the given read. @@ -76,7 +76,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] { def lt(target: IndelRealignmentTarget, read: RichAlignmentRecord): Boolean = { val region = read.readRegion - region.forall(r => target.readRange.compare(r) < 0) + region.forall(r => target.readRange.compareTo(r) < 0) } /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala index ea435aede6..e4da3d53ff 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala @@ -22,6 +22,7 @@ import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.algorithms.consensus.{ ConsensusGenerator, ConsensusGeneratorFromReads } +import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.models.{ Consensus, ReferencePosition, ReferenceRegion } import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.adam.rich.RichAlignmentRecord._ @@ -463,7 +464,7 @@ private[rdd] class RealignIndels(val consensusModel: ConsensusGenerator = new Co rdd.filter(r => r.getReadMapped) } else { val sr = rdd.filter(r => r.getReadMapped) - .keyBy(r => ReferencePosition(r).get) + .keyBy(ReferencePosition(_)) .sortByKey() sr.map(kv => kv._2) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala index 001159e27a..0349e00de8 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichAlignmentRecord.scala @@ -22,7 +22,7 @@ import htsjdk.samtools.{ Cigar, CigarElement, CigarOperator, TextCigarCodec } import org.bdgenomics.adam.models.{ Attribute, ReferencePosition, ReferenceRegion } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.util._ -import org.bdgenomics.formats.avro.AlignmentRecord +import org.bdgenomics.formats.avro.{ AlignmentRecord, Strand } import scala.collection.immutable.NumericRange object RichAlignmentRecord { @@ -139,6 +139,17 @@ class RichAlignmentRecord(val record: AlignmentRecord) { } } + def fivePrimeReferencePosition: Option[ReferencePosition] = { + fivePrimePosition.map(rp => { + val strand = if (record.getReadNegativeStrand) { + Strand.Reverse + } else { + Strand.Forward + } + ReferencePosition(record.getContig.getContigName, rp, strand) + }) + } + // Does this read overlap with the given reference position? def overlapsReferencePosition(pos: ReferencePosition): Option[Boolean] = { readRegion.map(_.contains(pos)) @@ -164,8 +175,8 @@ class RichAlignmentRecord(val record: AlignmentRecord) { } def getReferenceContext(readOffset: Int, referencePosition: Long, cigarElem: CigarElement, elemOffset: Int): ReferenceSequenceContext = { - val position = if (ReferencePosition.mappedPositionCheck(record)) { - Some(new ReferencePosition(record.getContig.getContigName.toString, referencePosition)) + val position = if (record.getReadMapped) { + Some(ReferencePosition(record.getContig.getContigName, referencePosition)) } else { None } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala index 641082d052..ff3626eb69 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala @@ -78,7 +78,6 @@ class ADAMKryoRegistrator extends KryoRegistrator { kryo.register(classOf[DatabaseVariantAnnotation], new AvroSerializer[DatabaseVariantAnnotation]()) kryo.register(classOf[Dbxref], new AvroSerializer[Dbxref]()) kryo.register(classOf[Feature], new AvroSerializer[Feature]()) - kryo.register(classOf[ReferencePositionWithOrientation], new ReferencePositionWithOrientationSerializer) kryo.register(classOf[ReferencePosition], new ReferencePositionSerializer) kryo.register(classOf[ReferencePositionPair], new ReferencePositionPairSerializer) kryo.register(classOf[SingleReadBucket], new SingleReadBucketSerializer) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala index 6721a2e8ae..b2c35c66d2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferencePositionSuite.scala @@ -33,69 +33,12 @@ class ReferencePositionSuite extends FunSuite { .setReadMapped(true) .build() - val refPosOpt = ReferencePosition(read) - - assert(refPosOpt.isDefined) - - val refPos = refPosOpt.get + val refPos = ReferencePosition(read) assert(refPos.referenceName === "chr1") assert(refPos.pos === 1L) } - test("create reference position from unmapped read") { - val read = AlignmentRecord.newBuilder() - .setReadMapped(false) - .build() - - val refPosOpt = ReferencePosition(read) - - assert(refPosOpt.isEmpty) - } - - test("create reference position from mapped read but contig not specified") { - val read = AlignmentRecord.newBuilder() - .setReadMapped(true) - .setStart(1L) - .build() - - val refPosOpt = ReferencePosition(read) - - assert(refPosOpt.isEmpty) - } - - test("create reference position from mapped read but contig is underspecified") { - val contig = Contig.newBuilder - // contigName is NOT set - //.setContigName("chr1") - .build - - val read = AlignmentRecord.newBuilder() - .setReadMapped(true) - .setStart(1L) - .setContig(contig) - .build() - - val refPosOpt = ReferencePosition(read) - - assert(refPosOpt.isEmpty) - } - - test("create reference position from mapped read but start not specified") { - val contig = Contig.newBuilder - .setContigName("chr1") - .build - - val read = AlignmentRecord.newBuilder() - .setReadMapped(true) - .setContig(contig) - .build() - - val refPosOpt = ReferencePosition(read) - - assert(refPosOpt.isEmpty) - } - test("create reference position from variant") { val variant = Variant.newBuilder() .setContig(Contig.newBuilder.setContigName("chr10").build()) @@ -127,81 +70,4 @@ class ReferencePositionSuite extends FunSuite { assert(refPos.referenceName === "chr10") assert(refPos.pos === 100L) } - - test("liftOverToReference works with a multi-block alignment on the forward strand") { - val exons = Seq(ReferenceRegionWithOrientation("1", 100, 200, negativeStrand = false), - ReferenceRegionWithOrientation("1", 300, 400, negativeStrand = false), - ReferenceRegionWithOrientation("1", 500, 600, negativeStrand = false)) - - val p0 = ReferencePositionWithOrientation.liftOverToReference(0, exons) - assert(p0.refPos.referenceName === "1") - assert(p0.refPos.pos === 100) - - val p1 = ReferencePositionWithOrientation.liftOverToReference(50, exons) - assert(p1.refPos.referenceName === "1") - assert(p1.refPos.pos === 150) - - val p2 = ReferencePositionWithOrientation.liftOverToReference(150, exons) - assert(p2.refPos.referenceName === "1") - assert(p2.refPos.pos === 350) - - val p3 = ReferencePositionWithOrientation.liftOverToReference(250, exons) - assert(p3.refPos.referenceName === "1") - assert(p3.refPos.pos === 550) - } - - test("liftOverToReference works with a multi-block alignment on the reverse strand") { - val exons = Seq(ReferenceRegionWithOrientation("1", 100, 200, negativeStrand = true), - ReferenceRegionWithOrientation("1", 300, 400, negativeStrand = true), - ReferenceRegionWithOrientation("1", 500, 600, negativeStrand = true)) - - val p1 = ReferencePositionWithOrientation.liftOverToReference(50, exons) - assert(p1.refPos.referenceName === "1") - assert(p1.refPos.pos === 549) - - val p2 = ReferencePositionWithOrientation.liftOverToReference(150, exons) - assert(p2.refPos.referenceName === "1") - assert(p2.refPos.pos === 349) - - val p3 = ReferencePositionWithOrientation.liftOverToReference(250, exons) - assert(p3.refPos.referenceName === "1") - assert(p3.refPos.pos === 149) - } - - test("lift over between two transcripts on the forward strand") { - // create mappings for transcripts - val t1 = Seq(ReferenceRegionWithOrientation("chr0", 0L, 201L, negativeStrand = false)) - val t2 = Seq(ReferenceRegionWithOrientation("chr0", 50L, 101L, negativeStrand = false), - ReferenceRegionWithOrientation("chr0", 175L, 201L, negativeStrand = false)) - - // check forward strand - val pos = ReferencePositionWithOrientation.liftOverToReference(60, t1) - - assert(pos.refPos.referenceName === "chr0") - assert(pos.refPos.pos === 60L) - assert(!pos.negativeStrand) - - val idx = pos.liftOverFromReference(t2) - - assert(idx === 10L) - } - - test("lift over between two transcripts on the reverse strand") { - // create mappings for transcripts - val t1 = Seq(ReferenceRegionWithOrientation("chr0", 0L, 201L, negativeStrand = true)) - val t2 = Seq(ReferenceRegionWithOrientation("chr0", 175L, 201L, negativeStrand = true), - ReferenceRegionWithOrientation("chr0", 50L, 101L, negativeStrand = true)) - - // check reverse strand - val idx = ReferencePositionWithOrientation(ReferencePosition("chr0", 190L), negativeStrand = true) - .liftOverFromReference(t2) - - assert(idx === 11L) - - val pos = ReferencePositionWithOrientation.liftOverToReference(idx, t1) - - assert(pos.refPos.referenceName === "chr0") - assert(pos.refPos.pos === 189L) - assert(pos.negativeStrand) - } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala index dbb9b5b4c0..f6fbc2fc3d 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceRegionSuite.scala @@ -18,7 +18,7 @@ package org.bdgenomics.adam.models import org.scalatest._ -import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig } +import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig, Strand } class ReferenceRegionSuite extends FunSuite { @@ -233,68 +233,21 @@ class ReferenceRegionSuite extends FunSuite { def point(refName: String, pos: Long): ReferencePosition = ReferencePosition(refName, pos) - test("build oriented reference region from non-oriented") { - val rrf = ReferenceRegionWithOrientation(ReferenceRegion("chr1", 10L, 20L), negativeStrand = false) - assert(rrf.referenceName === "chr1") - assert(rrf.start === 10L) - assert(rrf.end === 20L) - assert(!rrf.negativeStrand) - - val rrr = ReferenceRegionWithOrientation(ReferenceRegion("chr1", 10L, 20L), negativeStrand = true) - assert(rrr.referenceName === "chr1") - assert(rrr.start === 10L) - assert(rrr.end === 20L) - assert(rrr.negativeStrand) - } - - test("comparison tests for oriented reference region") { - assert(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false) - .contains(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false))) - assert(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = true) - .contains(ReferenceRegionWithOrientation("chr1", 15L, 17L, negativeStrand = true))) - - val rrf = ReferenceRegionWithOrientation(ReferenceRegion("chr1", 10L, 20L), negativeStrand = false) - val rrr = ReferenceRegionWithOrientation(ReferenceRegion("chr1", 10L, 20L), negativeStrand = true) - assert(!rrf.contains(rrr)) - - assert(!ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false) - .contains(ReferenceRegionWithOrientation("chr2", 10L, 20L, negativeStrand = false))) - assert(!ReferenceRegionWithOrientation("chr1", 20L, 50L, negativeStrand = true) - .contains(ReferenceRegionWithOrientation("chr1", 50L, 100L, negativeStrand = true))) - } - - test("comparison tests for oriented reference region vs position") { - assert(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false) - .contains(ReferencePositionWithOrientation(ReferencePosition("chr1", 10L), negativeStrand = false))) - assert(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = true) - .contains(ReferencePositionWithOrientation(ReferencePosition("chr1", 17L), negativeStrand = true))) - - assert(!ReferenceRegionWithOrientation(ReferenceRegion("chr1", 10L, 20L), negativeStrand = false) - .contains(ReferencePositionWithOrientation(ReferencePosition("chr1", 17L), negativeStrand = true))) - assert(!ReferenceRegionWithOrientation(ReferenceRegion("chr1", 10L, 20L), negativeStrand = true) - .contains(ReferencePositionWithOrientation(ReferencePosition("chr1", 10L), negativeStrand = false))) - - assert(!ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false) - .contains(ReferencePositionWithOrientation(ReferencePosition("chr2", 10L), negativeStrand = false))) - assert(!ReferenceRegionWithOrientation("chr1", 20L, 50L, negativeStrand = true) - .contains(ReferencePositionWithOrientation(ReferencePosition("chr1", 100L), negativeStrand = true))) - } - test("overlap tests for oriented reference region") { - assert(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false) - .overlaps(ReferenceRegionWithOrientation("chr1", 15L, 25L, negativeStrand = false))) - assert(ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = true) - .overlaps(ReferenceRegionWithOrientation("chr1", 5L, 15L, negativeStrand = true))) - - assert(!ReferenceRegionWithOrientation("chr1", 10L, 20L, negativeStrand = false) - .overlaps(ReferenceRegionWithOrientation("chr2", 10L, 20L, negativeStrand = false))) - assert(!ReferenceRegionWithOrientation("chr1", 20L, 50L, negativeStrand = true) - .overlaps(ReferenceRegionWithOrientation("chr1", 51L, 100L, negativeStrand = true))) + assert(ReferenceRegion("chr1", 10L, 20L, Strand.Forward) + .overlaps(ReferenceRegion("chr1", 15L, 25L, Strand.Forward))) + assert(ReferenceRegion("chr1", 10L, 20L, Strand.Reverse) + .overlaps(ReferenceRegion("chr1", 5L, 15L, Strand.Reverse))) + + assert(!ReferenceRegion("chr1", 10L, 20L, Strand.Forward) + .overlaps(ReferenceRegion("chr2", 10L, 20L, Strand.Forward))) + assert(!ReferenceRegion("chr1", 20L, 50L, Strand.Reverse) + .overlaps(ReferenceRegion("chr1", 51L, 100L, Strand.Reverse))) } test("check the width of a reference region") { assert(ReferenceRegion("chr1", 100, 201).width === 101) - assert(ReferenceRegionWithOrientation("chr2", 200, 401, negativeStrand = false).width === 201) - assert(ReferenceRegionWithOrientation("chr3", 399, 1000, negativeStrand = true).width === 601) + assert(ReferenceRegion("chr2", 200, 401, Strand.Forward).width === 201) + assert(ReferenceRegion("chr3", 399, 1000, Strand.Reverse).width === 601) } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala index 29e135a9d5..68313e6548 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala @@ -28,7 +28,7 @@ class MarkDuplicatesSuite extends ADAMFunSuite { AlignmentRecord.newBuilder().setReadMapped(false).build() } - def createMappedRead(referenceName: String, position: Long, + def createMappedRead(referenceName: String, start: Long, end: Long, readName: String = UUID.randomUUID().toString, avgPhredScore: Int = 20, numClippedBases: Int = 0, isPrimaryAlignment: Boolean = true, isNegativeStrand: Boolean = false) = { @@ -37,15 +37,15 @@ class MarkDuplicatesSuite extends ADAMFunSuite { val cigar = if (numClippedBases > 0) "%dS%dM".format(numClippedBases, 100 - numClippedBases) else "100M" val contig = Contig.newBuilder - .setContigName("reference%s".format(referenceName)) + .setContigName(referenceName) .build AlignmentRecord.newBuilder() .setContig(contig) - .setStart(position) + .setStart(start) .setQual(qual) .setCigar(cigar) - .setEnd(if (isNegativeStrand) position - 100 + numClippedBases else position + 100 - numClippedBases) + .setEnd(end) .setReadMapped(true) .setPrimaryAlignment(isPrimaryAlignment) .setReadName(readName) @@ -56,8 +56,8 @@ class MarkDuplicatesSuite extends ADAMFunSuite { .build() } - def createPair(firstReferenceName: String, firstPosition: Long, - secondReferenceName: String, secondPosition: Long, + def createPair(firstReferenceName: String, firstStart: Long, firstEnd: Long, + secondReferenceName: String, secondStart: Long, secondEnd: Long, readName: String = UUID.randomUUID().toString, avgPhredScore: Int = 20): Seq[AlignmentRecord] = { val firstContig = Contig.newBuilder @@ -68,19 +68,19 @@ class MarkDuplicatesSuite extends ADAMFunSuite { .setContigName(secondReferenceName) .build - val firstOfPair = createMappedRead(firstReferenceName, firstPosition, + val firstOfPair = createMappedRead(firstReferenceName, firstStart, firstEnd, readName = readName, avgPhredScore = avgPhredScore) firstOfPair.setFirstOfPair(true) firstOfPair.setMateMapped(true) firstOfPair.setMateContig(secondContig) - firstOfPair.setMateAlignmentStart(secondPosition) + firstOfPair.setMateAlignmentStart(secondStart) firstOfPair.setReadPaired(true) - val secondOfPair = createMappedRead(secondReferenceName, secondPosition, + val secondOfPair = createMappedRead(secondReferenceName, secondStart, secondEnd, readName = readName, avgPhredScore = avgPhredScore, isNegativeStrand = true) secondOfPair.setSecondOfPair(true) secondOfPair.setMateMapped(true) secondOfPair.setMateContig(firstContig) - secondOfPair.setMateAlignmentStart(firstPosition) + secondOfPair.setMateAlignmentStart(firstStart) secondOfPair.setReadPaired(true) Seq(firstOfPair, secondOfPair) } @@ -90,15 +90,15 @@ class MarkDuplicatesSuite extends ADAMFunSuite { } sparkTest("single read") { - val read = createMappedRead("0", 100) + val read = createMappedRead("0", 100, 200) val marked = markDuplicates(read) // Can't have duplicates with a single read, should return the read unchanged. assert(marked(0) == read) } sparkTest("reads at different positions") { - val read1 = createMappedRead("0", 42) - val read2 = createMappedRead("0", 43) + val read1 = createMappedRead("0", 42, 142) + val read2 = createMappedRead("0", 43, 143) val marked = markDuplicates(read1, read2) // Reads shouldn't be modified assert(marked.contains(read1) && marked.contains(read2)) @@ -106,9 +106,9 @@ class MarkDuplicatesSuite extends ADAMFunSuite { sparkTest("reads at the same position") { val poorReads = for (i <- 0 until 10) yield { - createMappedRead("1", 42, avgPhredScore = 20, readName = "poor%d".format(i)) + createMappedRead("1", 42, 142, avgPhredScore = 20, readName = "poor%d".format(i)) } - val bestRead = createMappedRead("1", 42, avgPhredScore = 30, readName = "best") + val bestRead = createMappedRead("1", 42, 142, avgPhredScore = 30, readName = "best") val marked = markDuplicates(List(bestRead) ++ poorReads: _*) val (dups, nonDup) = marked.partition(p => p.getDuplicateRead) assert(nonDup.size == 1 && nonDup(0) == bestRead) @@ -117,12 +117,12 @@ class MarkDuplicatesSuite extends ADAMFunSuite { sparkTest("reads at the same position with clipping") { val poorClippedReads = for (i <- 0 until 5) yield { - createMappedRead("1", 44, numClippedBases = 2, avgPhredScore = 20, readName = "poorClipped%d".format(i)) + createMappedRead("1", 44, 142, numClippedBases = 2, avgPhredScore = 20, readName = "poorClipped%d".format(i)) } val poorUnclippedReads = for (i <- 0 until 5) yield { - createMappedRead("1", 42, avgPhredScore = 20, readName = "poorUnclipped%d".format(i)) + createMappedRead("1", 42, 142, avgPhredScore = 20, readName = "poorUnclipped%d".format(i)) } - val bestRead = createMappedRead("1", 42, avgPhredScore = 30, readName = "best") + val bestRead = createMappedRead("1", 42, 142, avgPhredScore = 30, readName = "best") val marked = markDuplicates(List(bestRead) ++ poorClippedReads ++ poorUnclippedReads: _*) val (dups, nonDup) = marked.partition(p => p.getDuplicateRead) assert(nonDup.size == 1 && nonDup(0) == bestRead) @@ -131,9 +131,9 @@ class MarkDuplicatesSuite extends ADAMFunSuite { sparkTest("reads on reverse strand") { val poorReads = for (i <- 0 until 7) yield { - createMappedRead("10", 42, isNegativeStrand = true, avgPhredScore = 20, readName = "poor%d".format(i)) + createMappedRead("10", 42, 142, isNegativeStrand = true, avgPhredScore = 20, readName = "poor%d".format(i)) } - val bestRead = createMappedRead("10", 42, isNegativeStrand = true, avgPhredScore = 30, readName = "best") + val bestRead = createMappedRead("10", 42, 142, isNegativeStrand = true, avgPhredScore = 30, readName = "best") val marked = markDuplicates(List(bestRead) ++ poorReads: _*) val (dups, nonDup) = marked.partition(p => p.getDuplicateRead) assert(nonDup.size == 1 && nonDup(0) == bestRead) @@ -151,9 +151,9 @@ class MarkDuplicatesSuite extends ADAMFunSuite { sparkTest("read pairs") { val poorPairs = for ( i <- 0 until 10; - read <- createPair("0", 10, "0", 210, avgPhredScore = 20, readName = "poor%d".format(i)) + read <- createPair("0", 10, 110, "0", 110, 210, avgPhredScore = 20, readName = "poor%d".format(i)) ) yield read - val bestPair = createPair("0", 10, "0", 210, avgPhredScore = 30, readName = "best") + val bestPair = createPair("0", 10, 110, "0", 110, 210, avgPhredScore = 30, readName = "best") val marked = markDuplicates(bestPair ++ poorPairs: _*) val (dups, nonDups) = marked.partition(_.getDuplicateRead) assert(nonDups.size == 2 && nonDups.forall(p => p.getReadName.toString == "best")) @@ -162,10 +162,10 @@ class MarkDuplicatesSuite extends ADAMFunSuite { sparkTest("read pairs with fragments") { val fragments = for (i <- 0 until 10) yield { - createMappedRead("2", 33, avgPhredScore = 40, readName = "fragment%d".format(i)) + createMappedRead("2", 33, 133, avgPhredScore = 40, readName = "fragment%d".format(i)) } // Even though the phred score is lower, pairs always score higher than fragments - val pairs = createPair("2", 33, "2", 200, avgPhredScore = 20, readName = "pair") + val pairs = createPair("2", 33, 133, "2", 100, 200, avgPhredScore = 20, readName = "pair") val marked = markDuplicates(fragments ++ pairs: _*) val (dups, nonDups) = marked.partition(_.getDuplicateRead) assert(nonDups.size == 2 && nonDups.forall(p => p.getReadName.toString == "pair")) @@ -182,9 +182,9 @@ class MarkDuplicatesSuite extends ADAMFunSuite { sparkTest("read pairs that cross chromosomes") { val poorPairs = for ( i <- 0 until 10; - read <- createPair("ref0", 10, "ref1", 210, avgPhredScore = 20, readName = "poor%d".format(i)) + read <- createPair("ref0", 10, 110, "ref1", 110, 210, avgPhredScore = 20, readName = "poor%d".format(i)) ) yield read - val bestPair = createPair("ref0", 10, "ref1", 210, avgPhredScore = 30, readName = "best") + val bestPair = createPair("ref0", 10, 110, "ref1", 110, 210, avgPhredScore = 30, readName = "best") val marked = markDuplicates(bestPair ++ poorPairs: _*) val (dups, nonDups) = marked.partition(_.getDuplicateRead) assert(nonDups.size == 2 && nonDups.forall(p => p.getReadName.toString == "best")) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala index 2c8f7d893a..6adad99132 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala @@ -41,11 +41,9 @@ class BaseQualityRecalibrationSuite extends ADAMFunSuite { assert(bqsr.result.count == reads.count) // Compare the ObservationTables - val referenceObs: Set[String] = scala.io.Source.fromFile(new File(obsFilepath)).getLines().filter(_.length > 0).toSet - val testObs: Set[String] = bqsr.observed.toCSV.split('\n').filter(_.length > 0).toSet - assert(testObs == referenceObs) - - // TODO: Compare `result` against the reference output + val referenceObs: Seq[String] = scala.io.Source.fromFile(new File(obsFilepath)).getLines().filter(_.length > 0).toSeq.sortWith((kv1, kv2) => kv1.compare(kv2) < 0) + val testObs: Seq[String] = bqsr.observed.toCSV.split('\n').filter(_.length > 0).toSeq.sortWith((kv1, kv2) => kv1.compare(kv2) < 0) + referenceObs.zip(testObs).foreach(p => assert(p._1 === p._2)) } sparkTest("BQSR Test Input #1 w/ VCF Sites") {