Skip to content

Commit

Permalink
[ADAM-1709] Add ability to left normalize reads containing INDELs.
Browse files Browse the repository at this point in the history
Resolves #1709. Widens the visibility of the NormalizationUtils object from
private[consensus] to private[adam], so that it is package-private to
org.bdgenomics.adam. Adds code to AlignmentRecordRDD exposing INDEL
normalization as a transformation.
  • Loading branch information
fnothaft committed Sep 6, 2017
1 parent 51efbaf commit 8fcb03f
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import scala.collection.JavaConversions._
/**
* Utility for left normalizing INDELs in alignments.
*/
private[consensus] object NormalizationUtils {
private[adam] object NormalizationUtils {

/**
* Given a cigar, returns the cigar with the position of the cigar shifted left.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ import org.apache.spark.sql.{ Dataset, Row, SQLContext }
import org.apache.spark.storage.StorageLevel
import org.bdgenomics.adam.algorithms.consensus.{
ConsensusGenerator,
ConsensusGeneratorFromReads
ConsensusGeneratorFromReads,
NormalizationUtils
}
import org.bdgenomics.adam.converters.AlignmentRecordConverter
import org.bdgenomics.adam.instrumentation.Timers._
Expand Down Expand Up @@ -1478,4 +1479,32 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicRDD[Align
AlignmentRecordRDD.validateBins(bins)
BinQualities(this, bins)
}

/**
 * Left normalizes the INDELs of every aligned read in this RDD.
 *
 * Reads that are not flagged as mapped, or that carry no CIGAR, are passed
 * through as-is. For all other reads, the CIGAR is recomputed with
 * NormalizationUtils.leftAlignIndel; a new record is built only when the
 * recomputed CIGAR differs from the one already on the read.
 *
 * @return Returns a new RDD in which reads whose CIGAR changed under
 *   normalization carry the updated CIGAR, and all other reads are unchanged.
 */
def leftNormalizeIndels(): AlignmentRecordRDD = {
  // kept as a function value (not a local method) so that the closure shipped
  // to the executors does not reference the enclosing AlignmentRecordRDD
  val shiftLeft: AlignmentRecord => AlignmentRecord = read => {
    if (read.getReadMapped && read.getCigar != null) {
      val cigarBefore = read.getCigar
      val cigarAfter = NormalizationUtils.leftAlignIndel(read).toString

      if (cigarBefore == cigarAfter) {
        // nothing moved, so reuse the existing record
        read
      } else {
        // only the CIGAR is replaced; every other field is copied from the read
        // NOTE(review): mismatchingPositions is copied unchanged here; confirm
        // whether it should be recomputed when the INDEL moves
        AlignmentRecord.newBuilder(read)
          .setCigar(cigarAfter)
          .build
      }
    } else {
      read
    }
  }

  transform(_.map(shiftLeft))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1399,4 +1399,46 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite {
stringency = ValidationStringency.SILENT)
assert(sorted.rdd.count === 102)
}

sparkTest("left normalize indels") {
  // read that is not mapped and has no CIGAR; must pass through untouched
  val unmappedRead = AlignmentRecord.newBuilder()
    .setReadMapped(false)
    .build()

  // two base deletion; the assertions below expect its CIGAR to become 5M2D15M
  val shiftableDeletion = AlignmentRecord.newBuilder()
    .setReadMapped(true)
    .setSequence("AAAAACCCCCGGGGGTTTTT")
    .setStart(0)
    .setCigar("10M2D10M")
    .setMismatchingPositions("10^CC10")
    .build()

  // ten base deletion whose CIGAR is expected to come back unchanged
  val fixedDeletion = AlignmentRecord.newBuilder()
    .setReadMapped(true)
    .setSequence("AAAAACCCCCGGGGGTTTTT")
    .setStart(0)
    .setCigar("10M10D10M")
    .setMismatchingPositions("10^ATATATATAT10")
    .build()

  // ten base deletion in a homopolymer read followed by a run of mismatches;
  // its CIGAR is also expected to come back unchanged
  val homopolymerDeletion = AlignmentRecord.newBuilder()
    .setReadMapped(true)
    .setSequence("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")
    .setStart(5)
    .setCigar("29M10D31M")
    .setMismatchingPositions("29^GGGGGGGGGG10G0G0G0G0G0G0G0G0G0G11")
    .build()

  val reads = Seq(unmappedRead,
    shiftableDeletion,
    fixedDeletion,
    homopolymerDeletion)

  // these reads are aligned, but the "unaligned" factory is enough here
  // because normalization does not consult the RDD-level metadata
  val normalized = AlignmentRecordRDD.unaligned(sc.parallelize(reads))
    .leftNormalizeIndels()
    .rdd
    .collect

  assert(normalized.size === 4)

  // the unmapped read has a null CIGAR and drops out of this set
  val cigars = normalized.flatMap(r => Option(r.getCigar)).toSet
  Seq("5M2D15M", "10M10D10M", "29M10D31M").foreach(expected => {
    assert(cigars(expected))
  })
}
}

0 comments on commit 8fcb03f

Please sign in to comment.