Skip to content

Commit

Permalink
fix for mdtag issues with insertions
Browse files Browse the repository at this point in the history
  • Loading branch information
arahuja committed Jul 30, 2015
1 parent d79d434 commit d0302c5
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class RichAlignmentRecord(val record: AlignmentRecord) {
}

lazy val samtoolsCigar: Cigar = {
TextCigarCodec.decode(record.getCigar.toString)
TextCigarCodec.decode(record.getCigar)
}

// Returns the MdTag if the read is mapped, None otherwise
Expand Down
103 changes: 64 additions & 39 deletions adam-core/src/main/scala/org/bdgenomics/adam/util/MdTag.scala
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ object MdTag {
var cigarIdx = 0
var mdTagStringOffset = 0
var referencePos = referenceStart
var cigarReferencePosition = referenceStart

var cigarOperatorIndex = 0
var usedMatchingBases = 0
if (mdTagInput == null || mdTagInput == "0") {
new MdTag(referenceStart, List(), Map(), Map())
} else {
Expand All @@ -68,74 +68,99 @@ object MdTag {
val cigarElement = cigar.getCigarElement(cigarIdx)
val nextDigit = digitPattern.findPrefixOf(mdTag.substring(mdTagStringOffset))
(cigarElement.getOperator, nextDigit) match {
case (_, None) if mdTagStringOffset == 0 => throw new IllegalArgumentException(s"MdTag $mdTagInput does not start with a digit")
case (_, Some(matchingBases)) if matchingBases.toInt == 0 => {

case (_, None) if mdTagStringOffset == 0 =>
throw new IllegalArgumentException(s"MdTag $mdTagInput does not start with a digit")

case (_, Some(matchingBases)) if matchingBases.toInt == 0 =>
mdTagStringOffset += 1
}
case (CigarOperator.MATCH_OR_MISMATCH | CigarOperator.M | CigarOperator.EQ, Some(matchingBases)) => {
val numMatchingBases = math.min(cigarElement.getLength, matchingBases.toInt)

case (CigarOperator.M | CigarOperator.EQ, Some(matchingBases)) =>
val numMatchingBases = math.min(cigarElement.getLength - cigarOperatorIndex, matchingBases.toInt - usedMatchingBases)

if (numMatchingBases > 0) {
matches ::= NumericRange(referencePos, referencePos + numMatchingBases, 1L)

// Move the reference position the length of the matching sequence
referencePos += numMatchingBases

// Move ahead in the current CIGAR operator
cigarOperatorIndex += numMatchingBases

// Move ahead in the current MdTag digit
usedMatchingBases += numMatchingBases
}

if (matchingBases.toInt == usedMatchingBases) {
mdTagStringOffset += matchingBases.length
usedMatchingBases = 0
}
// Move the reference position the length of the matching sequence
referencePos += numMatchingBases.toInt
if (matchingBases.toInt <= cigarElement.getLength) mdTagStringOffset += matchingBases.size

// If the M operator has been fully read move on to the next operator
if (referencePos >= cigarReferencePosition + cigarElement.getLength) {
if (cigarOperatorIndex == cigarElement.getLength) {
cigarIdx += 1
cigarReferencePosition += cigarElement.getLength
cigarOperatorIndex = 0
}
}
case (CigarOperator.MATCH_OR_MISMATCH | CigarOperator.M | CigarOperator.X, None) => {

case (CigarOperator.M | CigarOperator.X, None) =>
basesPattern.findPrefixOf(mdTag.substring(mdTagStringOffset)) match {
// Must have found a base or digit pattern in CigarOperator M
case None => throw new IllegalArgumentException(s"No match or mismatched bases found for ${cigar.toString} in MDTag $mdTag")
case Some(mismatchedBases) => {
case None =>
throw new IllegalArgumentException(
s"No match or mismatched bases found for ${cigar.toString} in MDTag $mdTag"
)
case Some(mismatchedBases) =>
mismatchedBases.foreach {
base =>
mismatches += (referencePos -> base)
referencePos += 1
}
mdTagStringOffset += mismatchedBases.size
}
mdTagStringOffset += mismatchedBases.length
cigarOperatorIndex += mismatchedBases.length
}
// If the M operator has been fully read move on to the next operator
if (referencePos >= cigarReferencePosition + cigarElement.getLength) {
if (cigarOperatorIndex == cigarElement.getLength) {
cigarIdx += 1
cigarReferencePosition += cigarElement.getLength
cigarOperatorIndex = 0
}
}
case (CigarOperator.DELETION, None) => {

case (CigarOperator.DELETION, None) =>
mdTag.charAt(mdTagStringOffset) match {
case '^' => {
case '^' =>
// Skip ahead of the deletion '^' character
mdTagStringOffset += 1
basesPattern.findPrefixOf(mdTag.substring(mdTagStringOffset)) match {
case None => throw new IllegalArgumentException(s"No deleted bases found ${cigar.toString} in MDTag $mdTag")
case Some(deletedBases) => {
case None =>
throw new IllegalArgumentException(s"No deleted bases found ${cigar.toString} in MDTag $mdTag")
case Some(deletedBases) if deletedBases.length == cigarElement.getLength =>
deletedBases.foreach {
base =>
deletions += (referencePos -> base)
referencePos += 1
}
mdTagStringOffset += deletedBases.size
}
mdTagStringOffset += deletedBases.length
case Some(deletedBases) =>
throw new IllegalArgumentException(
s"Element ${cigarElement.getLength}${cigarElement.getOperator.toString} in cigar ${cigar.toString} contradicts number of bases listed in MDTag: ^${deletedBases}"
)
}
cigarReferencePosition += cigarElement.getLength
cigarIdx += 1
}
case _ => throw new IllegalArgumentException(s"CIGAR ${cigar.toString} indicates deletion found but no deleted bases in MDTag $mdTagInput")
cigarOperatorIndex = 0

case _ =>
throw new IllegalArgumentException(
s"CIGAR ${cigar.toString} indicates deletion found but no deleted bases in MDTag $mdTagInput"
)
}
}
case _ if cigarElement.getOperator.consumesReferenceBases() => {

case (CigarOperator.N, _) =>
referencePos += cigarElement.getLength
cigarReferencePosition += cigarElement.getLength
cigarIdx += 1
}
case _ => {
cigarOperatorIndex = 0

case (CigarOperator.INSERTION | CigarOperator.H | CigarOperator.S, _) =>
cigarIdx += 1
}

}
}
new MdTag(referenceStart, matches, mismatches, deletions)
Expand Down Expand Up @@ -400,7 +425,7 @@ class MdTag(
* @return True if this read has mismatches. We do not return true if the read has no mismatches but has deletions.
*/
def hasMismatches: Boolean = {
!mismatches.isEmpty
mismatches.nonEmpty
}

/**
Expand All @@ -419,7 +444,7 @@ class MdTag(
*/
def end(): Long = {
val ends = matches.map(_.end - 1) ::: mismatches.keys.toList ::: deletions.keys.toList
ends.reduce(_ max _)
ends.max
}

/**
Expand Down Expand Up @@ -496,7 +521,7 @@ class MdTag(
* @return MD string corresponding to [0-9]+(([A-Z]|\&#94;[A-Z]+)[0-9]+)
* @see http://zenfractal.com/2013/06/19/playing-with-matches/
*/
override def toString(): String = {
override def toString: String = {
if (matches.isEmpty && mismatches.isEmpty && deletions.isEmpty) {
"0"
} else {
Expand Down
151 changes: 150 additions & 1 deletion adam-core/src/test/scala/org/bdgenomics/adam/util/MdTagSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,138 @@ class MdTagSuite extends FunSuite {
assert(tag.toString === "20")
}

test("Get correct matches for mdtag with insertion") {
val tag = MdTag("10", 0L, TextCigarCodec.decode("5M3I5M"))
assert(tag.end === 9)

(0 until 9).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.toString === "10")
}

test("Get correct matches for mdtag with mismatches and insertion") {
val tag = MdTag("2A7", 0L, TextCigarCodec.decode("5M3I5M"))
assert(tag.end === 9)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2) === 'A')
(3 until 9).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.toString === "2A7")
}

test("Get correct matches for mdtag with insertion between mismatches") {
val tag = MdTag("2A4A2", 0L, TextCigarCodec.decode("5M3I5M"))
assert(tag.end === 9)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')

assert(tag.mismatches(7L) === 'A')
assert(tag.isMatch(8))
assert(tag.isMatch(9))

assert(tag.toString === "2A4A2")
}

test("Get correct matches for mdtag with intron between mismatches") {
val tag = MdTag("2A4A2", 0L, TextCigarCodec.decode("5M3N5M"))
assert(tag.end === 12)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')
assert(tag.isMatch(3))
assert(tag.isMatch(4))

assert(!tag.isMatch(5))
assert(!tag.isMatch(6))
assert(!tag.isMatch(7))

assert(tag.isMatch(8))
assert(tag.isMatch(9))

assert(tag.mismatches(10L) === 'A')
assert(tag.isMatch(11))
assert(tag.isMatch(12))

assert(tag.toString === "2A4A2")
}

test("Get correct matches for mdtag with intron and deletion between mismatches") {
val tag = MdTag("2A4A0^AAA2", 0L, TextCigarCodec.decode("5M3N3M3D2M"))
assert(tag.end === 15)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')
assert(tag.isMatch(3))
assert(tag.isMatch(4))

assert(!tag.isMatch(5))
assert(!tag.isMatch(6))
assert(!tag.isMatch(7))

assert(tag.isMatch(8))
assert(tag.isMatch(9))

assert(tag.mismatches(10L) === 'A')

assert(tag.deletions(11L) === 'A')
assert(tag.deletions(12L) === 'A')
assert(tag.deletions(13L) === 'A')

assert(tag.toString === "2A4A0^AAA2")
}

test("Throw exception when number of deleted bases in mdtag disagrees with CIGAR") {
intercept[IllegalArgumentException] {
MdTag("2A4A0^AAA2", 0L, TextCigarCodec.decode("5M3N3M4D2M"))
}
}

test("Get correct matches for mdtag with mismatch, insertion and deletion") {
val tag = MdTag("2A3^AAA4", 0L, TextCigarCodec.decode("5M3I1M3D4M"))
assert(tag.end === 12)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')

(3 to 5).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.deletedBase(6L) === Some('A'))
assert(tag.deletedBase(7L) === Some('A'))
assert(tag.deletedBase(8L) === Some('A'))

(9 to 12).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.toString === "2A3^AAA4")
}

test("Get correct matches for mdtag with mismatches, insertion and deletion") {
val tag = MdTag("2A3^AAA2A1", 0L, TextCigarCodec.decode("5M3I1M3D4M"))
assert(tag.end === 12)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')

(3 to 5).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.deletedBase(6L) === Some('A'))
assert(tag.deletedBase(7L) === Some('A'))
assert(tag.deletedBase(8L) === Some('A'))

(9 to 10).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.mismatches(11L) === 'A')

assert(tag.toString === "2A3^AAA2A1")
}

test("Get correct matches for MDTag with mismatches and deletions") {

val tag1 = MdTag("40A5^TTT54", 0L, TextCigarCodec.decode("46M3D54M"))
Expand All @@ -237,6 +369,7 @@ class MdTagSuite extends FunSuite {
assert(tag1.isMatch(41) === true)

assert(tag1.mismatchedBase(40) === Some('A'))
assert(tag1.toString === "40A5^TTT54")

val tag2 = MdTag("40A5^TTT0G53", 0L, TextCigarCodec.decode("46M3D54M"))
assert(tag2.hasMismatches === true)
Expand All @@ -249,6 +382,22 @@ class MdTagSuite extends FunSuite {
assert(tag2.mismatchedBase(40) === Some('A'))
assert(tag2.mismatchedBase(49) === Some('G'))
assert(tag2.isMatch(50) === true)
assert(tag2.toString === "40A5^TTT0G53")

val tag3 = MdTag("2^GA5^TC6", 0L, TextCigarCodec.decode("2M2D1M2I2M4I2M2D6M"))
(0 to 1).foreach(l => assert(tag3.isMatch(l)))
(2 to 3).foreach(l => assert(!tag3.isMatch(l)))
(4 to 8).foreach(l => assert(tag3.isMatch(l)))
(9 to 10).foreach(l => assert(!tag3.isMatch(l)))
(11 to 16).foreach(l => assert(tag3.isMatch(l)))

assert(tag3.deletedBase(2) == Some('G'))
assert(tag3.deletedBase(3) == Some('A'))

assert(tag3.deletedBase(9) == Some('T'))
assert(tag3.deletedBase(10) == Some('C'))

assert(tag3.toString === "2^GA5^TC6")
}

test("Get correct matches base from MDTag and CIGAR with N") {
Expand Down Expand Up @@ -453,4 +602,4 @@ class MdTagSuite extends FunSuite {
test("handle '=' and 'X' operators") {
testTag("ACCCAAGT", "ACCATAGA", "3=2X2=1X", 0, "3A0T2A0", 0, 7)
}
}
}

0 comments on commit d0302c5

Please sign in to comment.