diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMRecord.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMRecord.scala index d55e40e7b1..a0dd9f447d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMRecord.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rich/RichADAMRecord.scala @@ -22,11 +22,39 @@ import org.bdgenomics.adam.util._ import scala.Some import scala.collection.immutable.NumericRange import org.bdgenomics.adam.models.{ ReferenceRegion, ReferencePosition, Attribute } +import java.util.regex.Pattern object RichADAMRecord { val CIGAR_CODEC: TextCigarCodec = TextCigarCodec.getSingleton val ILLUMINA_READNAME_REGEX = "[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*".r + val cigarPattern = Pattern.compile("([0-9]+)([MIDNSHPX=])") + + /** + * Parses a CIGAR string, and returns the aligned length with respect to the + * reference genome (i.e. skipping clipping, padding, and insertion operators) + * + * @param cigar The CIGAR string whose reference length is to be measured + * @return A non-negative integer, the sum of the MDNX= operators in the CIGAR string. + */ + def referenceLengthFromCigar(cigar: String): Int = { + val m = cigarPattern.matcher(cigar) + var i = 0 + var len: Int = 0 + while (i < cigar.length) { + if (m.find(i)) { + val op = m.group(2) + if ("MDNX=".indexOf(op) != -1) { + len += m.group(1).toInt + } + } else { + return len + } + i = m.end() + } + len + } + def apply(record: ADAMRecord) = { new RichADAMRecord(record) } @@ -39,6 +67,8 @@ class IlluminaOptics(val tile: Long, val x: Long, val y: Long) {} class RichADAMRecord(val record: ADAMRecord) { + lazy val referenceLength: Int = RichADAMRecord.referenceLengthFromCigar(record.cigar) + lazy val readRegion = ReferenceRegion(this) // Returns the quality scores as a list of bytes diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMRecordSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMRecordSuite.scala index b5bcd95eb5..94d88d3f51 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMRecordSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rich/RichADAMRecordSuite.scala @@ -22,6 +22,16 @@ import org.bdgenomics.adam.models.{ ReferencePosition, TagType, Attribute } class RichADAMRecordSuite extends FunSuite { + test("referenceLengthFromCigar") { + assert(referenceLengthFromCigar("3M") === 3) + assert(referenceLengthFromCigar("30M") === 30) + assert(referenceLengthFromCigar("10Y") === 0) // should abort when it hits an illegal operator + assert(referenceLengthFromCigar("10M1Y") === 10) // same + assert(referenceLengthFromCigar("10M1I10M") === 20) + assert(referenceLengthFromCigar("10M1D10M") === 21) + assert(referenceLengthFromCigar("1S10M1S") === 10) + } + test("Unclipped Start") { val recordWithoutClipping = ADAMRecord.newBuilder().setReadMapped(true).setCigar("10M").setStart(42).build() val recordWithClipping = ADAMRecord.newBuilder().setReadMapped(true).setCigar("2S8M").setStart(42).build()