Skip to content

Commit

Permalink
[ADAM-2102] Inverted duplicates are not found in mark duplicates (#2101)
Browse files Browse the repository at this point in the history
* Inverted duplicates are not found in mark duplicates

Mark duplicates uses the left and right positions corrected for clipping
to match duplicates on coordinates. However, this approach can miss
duplicates that are truly inverted (i.e., the positions are swapped and
the strand directions are reversed).

Picard and sambamba pick up these reads as duplicates.

This change fixes the issue by checking the strand direction when
performing the left and right group-bys in mark duplicates.

* Revert incorrect formatting changes

* Revert incorrect formatting changes
  • Loading branch information
pauldwolfe authored and heuermh committed Dec 12, 2018
1 parent 78bb9e5 commit e696b64
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,10 @@ package org.bdgenomics.adam.rdd.read
import org.bdgenomics.utils.misc.Logging
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.instrumentation.Timers._
import org.bdgenomics.adam.models.{
RecordGroupDictionary,
ReferencePosition
}
import org.bdgenomics.adam.models.{ RecordGroupDictionary, ReferencePosition }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.fragment.FragmentDataset
import org.bdgenomics.formats.avro.{ AlignmentRecord, Fragment }
import org.bdgenomics.formats.avro.{ AlignmentRecord, Fragment, Strand }

private[rdd] object MarkDuplicates extends Serializable with Logging {

Expand Down Expand Up @@ -96,19 +93,24 @@ private[rdd] object MarkDuplicates extends Serializable with Logging {
recordGroups: RecordGroupDictionary): RDD[SingleReadBucket] = {
checkRecordGroups(recordGroups)

// Choose the reference position matching the requested strand: prefer the
// default position if it lies on that strand, otherwise the other position,
// and fall back to the default position when neither matches.
def positionForStrand(defaultPos: Option[ReferencePosition], otherPos: Option[ReferencePosition], strand: Strand) = {
  val onStrand = (pos: Option[ReferencePosition]) => pos.filter(_.strand == strand)
  onStrand(defaultPos)
    .orElse(onStrand(otherPos))
    .orElse(defaultPos)
}

// Key a (position pair, read bucket) by library and strand-corrected left
// position so that duplicates — including inverted pairs — group together.
def leftPositionAndLibrary(p: (ReferencePositionPair, SingleReadBucket),
                           rgd: RecordGroupDictionary): (Option[ReferencePosition], String) = {
  // Use the forward-strand position as the "left" coordinate so inverted
  // duplicates (positions swapped and strands reversed) share a grouping key.
  val leftPosition = positionForStrand(p._1.read1refPos, p._1.read2refPos, Strand.FORWARD)
  // Resolve the library from the record group when one is set; null is the
  // sentinel the original code used for "no record group / no library".
  val library = Option(p._2.allReads.head.getRecordGroupName)
    .flatMap(name => rgd(name).library)
    .orNull
  (leftPosition, library)
}

// Group by the strand-corrected right position: prefer the reverse-strand
// position (read2, else read1) so inverted duplicates match; fall back to
// read2's position when neither read lies on the reverse strand.
def rightPosition(p: (ReferencePositionPair, SingleReadBucket)): Option[ReferencePosition] = {
  positionForStrand(p._1.read2refPos, p._1.read1refPos, Strand.REVERSE)
}

rdd.keyBy(ReferencePositionPair(_))
Expand Down Expand Up @@ -166,4 +168,5 @@ private[rdd] object MarkDuplicates extends Serializable with Logging {
scoreBucket(x._2) - scoreBucket(y._2)
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -309,4 +309,17 @@ class MarkDuplicatesSuite extends ADAMFunSuite {
assert(nonDups.size == 2 && nonDups.forall(p => p.getReadName.toString == "best"))
assert(dups.forall(p => p.getReadName.startsWith("poor")))
}

sparkTest("inverse pairs") {
  // Two pairs covering the same coordinates, but with the positions swapped
  // and the strand orientations reversed (a truly "inverted" duplicate pair).
  val firstPair = createPair("0", 100, 251, "0", 1100, 1251, "pair1")
  val secondPair = createPair("0", 1100, 1251, "0", 100, 251, "pair2")
  secondPair.head.setReadNegativeStrand(true)
  secondPair.head.setMateNegativeStrand(false)
  secondPair(1).setReadNegativeStrand(false)
  secondPair(1).setMateNegativeStrand(true)

  val marked = markDuplicateFragments(firstPair ++ secondPair: _*)
  val (dups, nonDups) = marked.partition(_.getDuplicateRead)
  // One pair (2 reads) should be flagged duplicate, the other (2 reads) kept.
  assert(dups.size == 2)
  assert(nonDups.size == 2)
}
}

0 comments on commit e696b64

Please sign in to comment.