Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dbsnp to mutect caller #4

Merged
merged 2 commits into from
Aug 3, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,29 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit

val companion: ExplorerCompanion = ReadExplorer

def mdTagToMismatchPositions(mdTag: MdTag): Seq[Int] = {
def mdTagToMismatchPositions(mdTag: MdTag, cigar: List[CigarElement]): Seq[Int] = {
var idx = 0
val insertions = cigar.map(c => {
(c, c.getLength)
}).map(kv => {
val r = (kv._1, idx)
idx += kv._2
r
}).flatMap(kv => {
val (ce, i) = kv
if (ce.getOperator == CigarOperator.I) {
(0 until ce.getLength).map(_ + i)
} else {
Seq.empty
}
})

val deletions = mdTag.deletions
val oriPositions = mdTag.mismatches.keys
var mismatchPositions = oriPositions.zip(oriPositions)
for (iPos <- insertions) {
mismatchPositions = mismatchPositions.map({ case (p, i) => if (i <= iPos) (p + 1, i) else (p, i) })
}
for ((dPos, _) <- deletions) {
mismatchPositions = mismatchPositions.map({ case (p, i) => if (i > dPos) (p - 1, i) else (p, i) })
}
Expand Down Expand Up @@ -77,7 +96,8 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
val mdString = read.getMismatchingPositions
val mismatchPositions: Option[Seq[Int]] = if (mdString != null && mdString != "")
Some(mdTagToMismatchPositions(MdTag(read.getMismatchingPositions,
if (cigar.head.getOperator == CigarOperator.S) cigar.head.getLength else 0)))
if (cigar.head.getOperator == CigarOperator.S) cigar.head.getLength else 0,
richRead.samtoolsCigar), cigar))
else None

// observations
Expand All @@ -87,7 +107,11 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
var readPos = 0

// get the sum of mismatching bases
val qscores: Option[Seq[Int]] = mismatchPositions.map(l => l.map(p => quals(p)))
val qscores: Option[Seq[Int]] = mismatchPositions.map(l => {
l.map(p => {
quals(p)
})
})

val mismatchQScoreSum = qscores.map(_.sum)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@
*/
package org.bdgenomics.avocado.genotyping

import java.io.File
import org.apache.commons.configuration.{ HierarchicalConfiguration, SubnodeConfiguration }
import org.apache.spark.Logging
import org.apache.spark.broadcast.Broadcast
import org.bdgenomics.adam.models._
import org.bdgenomics.avocado.algorithms.mutect._
import org.bdgenomics.avocado.algorithms.math.{ LogUtils, LogBinomial }
import org.bdgenomics.avocado.models.{ AlleleObservation, Observation }
import org.bdgenomics.avocado.stats.AvocadoConfigAndStats
import org.bdgenomics.formats.avro.{ Contig, Genotype, Variant }
import org.bdgenomics.avocado.algorithms.math.{ LogUtils, LogBinomial }

import scala.annotation.tailrec

Expand All @@ -40,6 +42,13 @@ object MutectGenotyper extends GenotyperCompanion {
require(config.containsKey("somaticId"),
"Somatic sample ID is not defined in configuration file.")

// load snp table
val snpTable = if (config.containsKey("dbSNPFile")) {
Some(stats.sc.broadcast(SnpTable(new File(config.getString("dbSNPFile")))))
} else {
None
}

// pull out algorithm parameters and return genotyper
new MutectGenotyper(config.getString("normalId"),
config.getString("somaticId"),
Expand All @@ -61,7 +70,8 @@ object MutectGenotyper extends GenotyperCompanion {
config.getBoolean("experimentalMutectIndelDetector", false),
config.getDouble("errorForPowerCalculations", 0.001),
config.getInt("minThetaForPowerCalc", 20),
None)
None,
snpTable)
}
}

Expand Down Expand Up @@ -93,7 +103,8 @@ class MutectGenotyper(normalId: String,
experimentalMutectIndelDetector: Boolean = false,
errorForPowerCalculations: Double = 0.001,
minThetaForPowerCalc: Int = 20,
f: Option[Double] = None) extends SiteGenotyper with Logging {
f: Option[Double] = None,
snpTable: Option[Broadcast[SnpTable]] = None) extends SiteGenotyper with Logging {

val companion: GenotyperCompanion = MutectGenotyper

Expand Down Expand Up @@ -164,7 +175,10 @@ class MutectGenotyper(normalId: String,

val normalNotHet = somaticModel.logOdds(ref, alt, normals, None)

val dbSNPsite = false //TODO figure out if this is a dbSNP position
// is this a known site?
val dbSNPsite = snpTable.fold(false)(t => {
t.value.contains(ReferencePosition(region.referenceName, region.start))
})

val passSomatic: Boolean = (dbSNPsite && normalNotHet >= somDbSnpThreshold) || (!dbSNPsite && normalNotHet >= somNovelThreshold)
val nInsertions = tumors.map(ao => if (math.abs(ao.distanceToNearestReadInsertion.getOrElse(Int.MaxValue)) <= indelNearnessThreshold) 1 else 0).sum
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.bdgenomics.avocado.postprocessing

import java.io.File
import org.apache.commons.configuration.SubnodeConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.{
ReferencePosition,
Expand Down Expand Up @@ -48,23 +49,26 @@ object FilterKnownSites extends PostprocessingStage {
val keepKnowns = config.getBoolean("keepKnowns", false)

// build and apply filter
val filter = new FilterKnownSites(datasetName, snpTable, keepKnowns)
val filter = new FilterKnownSites(datasetName, snpTable, keepKnowns, Some(stats.sc))
filter.filter(rdd)
}
}

class FilterKnownSites(dataset: Option[String],
siteTable: SnpTable,
keepKnowns: Boolean) extends VariantAttributeFilter {
keepKnowns: Boolean,
sc: Option[SparkContext] = None) extends VariantAttributeFilter {

val filterName = if (keepKnowns) {
dataset.fold("KNOWN_SITE")(d => "SITE_IN_%s".format(d))
} else {
dataset.fold("UNKNOWN_SITE")(d => "SITE_NOT_IN_%s".format(d))
}

private val table = sc.fold(siteTable)(c => c.broadcast(siteTable).value)

def filterFn(v: RichVariant): Boolean = {
val in = siteTable.contains(ReferencePosition(v.variant.getContig.getContigName,
val in = table.contains(ReferencePosition(v.variant.getContig.getContigName,
v.variant.getStart))
if (keepKnowns) {
in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ class ReadExplorerSuite extends AvocadoFunSuite {
.setSequence("ACTGA")
.setQual(":::?:")
.setCigar("1M2I2M")
.setMismatchingPositions("3C1")
.setMismatchingPositions("1C1")
.setRecordGroupSample("sample1")
.build()

Expand Down