Skip to content

Commit

Permalink
Merge pull request bigdatagenomics#1080 from fnothaft/issues/1061-mat…
Browse files Browse the repository at this point in the history
…ch-error-attributes

[ADAM-1061] Clean up attributes regex and denormalized fields
  • Loading branch information
heuermh authored Jul 19, 2016
2 parents 4b6e107 + 367ddcf commit 06d0fd2
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,13 @@ private[adam] class AlignmentRecordConverter extends Serializable {
.foreach(v => builder.setReadFailsVendorQualityCheckFlag(v.booleanValue))
Option(adamRecord.getMismatchingPositions)
.foreach(builder.setAttribute("MD", _))
Option(adamRecord.getOrigQual)
.map(s => s.getBytes.map(v => (v - 33).toByte)) // not ascii, but short int
.foreach(builder.setOriginalBaseQualities(_))
Option(adamRecord.getOldCigar)
.foreach(builder.setAttribute("OC", _))
Option(adamRecord.getOldPosition)
.foreach(i => builder.setAttribute("OP", i + 1))

// add all other tags
if (adamRecord.getAttributes != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,42 @@ import org.bdgenomics.adam.util.AttributeUtils
import org.bdgenomics.formats.avro.AlignmentRecord
import scala.collection.JavaConverters._

class SAMRecordConverter extends Serializable with Logging {
/**
* Helper class for converting SAMRecords into AlignmentRecords.
*/
private[adam] class SAMRecordConverter extends Serializable with Logging {

/**
* Returns true if a tag should not be kept in the attributes field.
*
* The SAM/BAM format supports attributes, which is a key/value pair map. In
* ADAM, we have promoted some of these fields to "primary" fields, so that we
* can more efficiently access them. These include the MD tag, which describes
* substitutions against the reference; the OQ tag, which describes the
* original read base qualities; and the OP and OC tags, which describe the
* original read alignment position and CIGAR.
*
* @param attrTag Tag name to check.
* @return Returns true if the tag should be skipped.
*/
private[converters] def skipTag(attrTag: String): Boolean = attrTag match {
case "OQ" => true
case "OP" => true
case "OC" => true
case "MD" => true
case _ => false
}

/**
* Converts a SAM record into an Avro AlignmentRecord.
*
* @param samRecord Record to convert.
* @param dict The sequence dictionary describing the reference genome this
* read was aligned to.
* @param readGroups The read group dictionary describing the read group this
* read is from.
* @return Returns the original record converted into Avro.
*/
def convert(
samRecord: SAMRecord,
dict: SequenceDictionary,
Expand Down Expand Up @@ -173,7 +208,7 @@ class SAMRecordConverter extends Serializable with Logging {
attr =>
if (attr.tag == "MD") {
builder.setMismatchingPositions(attr.value.toString)
} else {
} else if (!skipTag(attr.tag)) {
tags ::= AttributeUtils.convertSAMTagAndValue(attr)
}
}
Expand All @@ -196,5 +231,4 @@ class SAMRecordConverter extends Serializable with Logging {
}
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import org.bdgenomics.adam.models.{ TagType, Attribute }
*/
object AttributeUtils {

val attrRegex = RegExp("([^:]{2,4}):([AifZHB]):([cCiIsSf]{1},)?(.*)")
val attrRegex = RegExp("([^:]{2,4}):([AifZHB]):(.*)")
val arrayRegex = RegExp("([cCiIsSf]{1},)(.*)")

def convertSAMTagAndValue(attr: SAMTagAndValue): Attribute = {
if (attr.value.isInstanceOf[TagValueAndUnsignedArrayFlag]) {
Expand Down Expand Up @@ -71,21 +72,31 @@ object AttributeUtils {
*/
def parseAttribute(encoded: String): Attribute = {
attrRegex.matches(encoded) match {
case Some(m) => createAttribute((m.group(1), m.group(2), m.group(3), m.group(4)))
case Some(m) => createAttribute((m.group(1), m.group(2), m.group(3)))
case None =>
throw new IllegalArgumentException(
"attribute string \"%s\" doesn't match format attrTuple:type:value".format(encoded)
)
}
}

private def createAttribute(attrTuple: (String, String, String, String)): Attribute = {
private def createAttribute(attrTuple: (String, String, String)): Attribute = {
val tagName = attrTuple._1
val tagTypeString = attrTuple._2
val tagArrayString: Option[String] = Option(attrTuple._3)
val valueStr = attrTuple._4

val fullTagString = tagArrayString.fold(tagTypeString)(arrayString => "%s:%s".format(tagTypeString, arrayString))
val (fullTagString, valueStr) = if (tagTypeString == "B") {
arrayRegex.matches(attrTuple._3) match {
case Some(m) => {
("%s:%s".format(tagTypeString, m.group(1)), m.group(2))
}
case None => {
throw new IllegalArgumentException(
"Array tags must define array format. For tag %s.".format(attrTuple))
}
}
} else {
(tagTypeString, attrTuple._3)
}

// partial match, but these letters should be (per the SAM file format spec)
// the only legal values of the tagTypeString anyway.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ class SAMRecordConverterSuite extends FunSuite {
// RecordGroupDictionary to be used as a parameter during conversion
val testRecordGroupDict = new RecordGroupDictionary(Seq())

// set the oq, md, oc, and op attributes
testSAMRecord.setOriginalBaseQualities("*****".getBytes.map(v => (v - 33).toByte))
testSAMRecord.setAttribute("MD", "100")
testSAMRecord.setAttribute("OC", "100M")
testSAMRecord.setAttribute("OP", 1)

// Convert samRecord to alignmentRecord
val testAlignmentRecord = testRecordConverter.convert(testSAMRecord, testSequenceDict, testRecordGroupDict)

Expand All @@ -64,6 +70,11 @@ class SAMRecordConverterSuite extends FunSuite {
assert(!testAlignmentRecord.getReadPaired)
assert(testAlignmentRecord.getReadInFragment != 1)
assert(testAlignmentRecord.getSupplementaryAlignment === testSAMRecord.getSupplementaryAlignmentFlag)
assert(testAlignmentRecord.getOrigQual === "*****")
assert(testAlignmentRecord.getMismatchingPositions === "100")
assert(testAlignmentRecord.getOldCigar === "100M")
assert(testAlignmentRecord.getOldPosition === 0L)
assert(testAlignmentRecord.getAttributes === "XS:i:0\tAS:i:75\tNM:i:0")
}

test("testing the fields in an alignmentRecord obtained from an unmapped samRecord conversion") {
Expand Down Expand Up @@ -138,4 +149,12 @@ class SAMRecordConverterSuite extends FunSuite {
assert(newAlignmentRecord.getQual === null)
}

test("don't keep denormalized fields") {
val rc = new SAMRecordConverter

assert(rc.skipTag("MD"))
assert(rc.skipTag("OQ"))
assert(rc.skipTag("OP"))
assert(rc.skipTag("OC"))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,13 @@ class AttributeUtilsSuite extends FunSuite {
test("parseTags works with NumericSequence tagType") {
val tags = parseAttributes("jM:B:c,-1\tjI:B:i,-1,1")

assert(tags.length === 2)
assert(tags(0).tag === "jM")
assert(tags(0).tagType === TagType.NumericByteSequence)
assert(tags(0).value.asInstanceOf[Array[Number]].sameElements(Array(-1)))
assert(tags(1).tag === "jI")
assert(tags(1).tagType === TagType.NumericIntSequence)
assert(tags(1).value.asInstanceOf[Array[Number]].sameElements(Array(-1, 1)))

}

test("empty string is parsed as zero tagStrings") {
Expand All @@ -67,6 +69,26 @@ class AttributeUtilsSuite extends FunSuite {
assert(tags.size === 1)
assert(tags.head.value === string)
}

test("oq string tag with many ':' in it is correctly parsed") {
val tags = parseAttributes("OQ:Z:C55/15D:::::::.7GFFAFDA442.40F=AGHHE")

assert(tags.size === 1)
assert(tags.head.value === "C55/15D:::::::.7GFFAFDA442.40F=AGHHE")
}

test("oq string tag with a ',' in it is correctly parsed") {
val tags = parseAttributes("OQ:Z:C,55/15D:::::::.7GFFAFDA442.40F=AGHHE")

assert(tags.size === 1)
assert(tags.head.value === "C,55/15D:::::::.7GFFAFDA442.40F=AGHHE")
}

test("if a tag is an array but doesn't define it's format, throw") {
intercept[IllegalArgumentException] {
val tags = parseAttributes("jI:B:1,2,3")
}
}
}

class AttributeSuite extends FunSuite {
Expand Down

0 comments on commit 06d0fd2

Please sign in to comment.