bigdatagenomics · fnothaft · Feb 28, 2014 · Feb 26, 2014 · Feb 27, 2014 · Feb 27, 2014
diff --git a/adam-cli/src/main/scala/edu/berkeley/cs/amplab/adam/cli/Adam2Vcf.scala b/adam-cli/src/main/scala/edu/berkeley/cs/amplab/adam/cli/Adam2Vcf.scala
@@ -16,18 +16,13 @@
 
 package edu.berkeley.cs.amplab.adam.cli
 
-import edu.berkeley.cs.amplab.adam.converters.VariantContextConverter
-import edu.berkeley.cs.amplab.adam.models.ADAMVariantContext
+import edu.berkeley.cs.amplab.adam.avro.ADAMGenotype
 import edu.berkeley.cs.amplab.adam.rdd.AdamContext._
 import edu.berkeley.cs.amplab.adam.rdd.variation.ADAMVariationContext._
-import edu.berkeley.cs.amplab.adam.util.{AdamVCFOutputFormat, VcfHeaderUtils}
-import fi.tkk.ics.hadoop.bam.VariantContextWritable
-import org.apache.hadoop.io.LongWritable
-import org.apache.hadoop.mapreduce.Job
-import org.apache.spark.{Logging, SparkContext}
-import org.apache.spark.SparkContext._
+import org.kohsuke.args4j.Argument
 import org.apache.spark.rdd.RDD
-import org.kohsuke.args4j.{Argument, Option => Args4jOption}
+import org.apache.spark.{Logging, SparkContext}
+import org.apache.hadoop.mapreduce.Job
 
 object Adam2Vcf extends AdamCommandCompanion {
 
@@ -50,5 +45,7 @@ class Adam2Vcf(val args: Adam2VcfArgs) extends AdamSparkCommand[Adam2VcfArgs] wi
  val companion = Adam2Vcf
 
  def run(sc: SparkContext, job: Job) {
+ val adamGTs: RDD[ADAMGenotype] = sc.adamLoad(args.adamFile)
+ sc.adamVCFSave(args.outputPath, adamGTs.toADAMVariantContext)
  }
 }
diff --git a/adam-core/src/main/java/edu/berkeley/cs/amplab/adam/rich/GenotypeType.java b/adam-core/src/main/java/edu/berkeley/cs/amplab/adam/rich/GenotypeType.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2014. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.rich;
+
+public enum GenotypeType {
+ HOM_REF, HET, HOM_ALT, NO_CALL
+}
diff --git a/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/models/ConcordanceTable.scala b/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/models/ConcordanceTable.scala
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2014. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.models
+
+import edu.berkeley.cs.amplab.adam.rich.GenotypeType
+
+object ConcordanceTable {
+ def apply() = new ConcordanceTable()
+
+ def create(v: (GenotypeType, GenotypeType)) = new ConcordanceTable(v)
+
+ /**
+ *
+ * @param l Modified
+ * @param r
+ * @return
+ */
+
+ def addComparison(l: ConcordanceTable, r: (GenotypeType, GenotypeType)) = {
+ l.add(r)
+ l
+ }
+
+ /**
+ *
+ * @param l Modified
+ * @param r
+ * @return
+ */
+ def mergeTable(l: ConcordanceTable, r: ConcordanceTable) = {
+ l.add(r)
+ l
+ }
+
+ // Relevant sub-groups of concordance table entries
+ val CALLED = Seq(GenotypeType.HOM_REF, GenotypeType.HET, GenotypeType.HOM_ALT)
+ val VARIANT = Seq(GenotypeType.HET, GenotypeType.HOM_ALT)
+ val ALL = GenotypeType.values.toSeq
+
+ implicit def typesToIndices(t : Seq[GenotypeType]) : Seq[Int] = t.map(_.ordinal)
+}
+
+
+class ConcordanceTable {
+ import ConcordanceTable._
+
+ private val table_ = Array.fill[Long](GenotypeType.values.length, GenotypeType.values.length)(0L)
+
+ def this(p : (GenotypeType, GenotypeType)) = {
+ this()
+ add(p)
+ }
+
+ def add(p : (GenotypeType, GenotypeType)) = table_(p._1.ordinal)(p._2.ordinal) += 1L
+ def add(that : ConcordanceTable) = {
+ for (r <- ALL : Seq[Int]; c <- ALL : Seq[Int])
+ table_(r)(c) += that.table_(r)(c)
+ }
+
+ def get(test: GenotypeType, truth: GenotypeType) : Long = table_(test.ordinal)(truth.ordinal)
+
+
+ def total(diagonal : Seq[Int]) = ( for (i <- diagonal) yield table_(i)(i) ).sum
+ def total(test : Seq[Int] = ALL, truth : Seq[Int] = ALL) =
+ ( for (r <- test; c <- truth) yield table_(r)(c) ).sum
+
+ def concordance : Double = total(CALLED).toDouble / total(CALLED, CALLED).toDouble
+ def nonReferenceSensitivity : Double = total(VARIANT, VARIANT).toDouble / total(ALL, VARIANT).toDouble
+}
diff --git a/...-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationContext.scala b/...-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationContext.scala
@@ -16,20 +16,52 @@
 
 package edu.berkeley.cs.amplab.adam.rdd.variation
 
-import org.apache.spark.rdd.RDD
-import edu.berkeley.cs.amplab.adam.models.ADAMVariantContext
-import org.apache.spark.{Logging, SparkContext}
-import org.apache.hadoop.mapreduce.Job
+import edu.berkeley.cs.amplab.adam.avro.ADAMGenotype
 import edu.berkeley.cs.amplab.adam.converters.VariantContextConverter
-import fi.tkk.ics.hadoop.bam.{VariantContextWritable, VCFInputFormat}
+import edu.berkeley.cs.amplab.adam.models.{ADAMVariantContext, SequenceDictionary}
+import edu.berkeley.cs.amplab.adam.rdd.variation.ADAMVariationContext._
+import fi.tkk.ics.hadoop.bam._
 import org.apache.hadoop.io.LongWritable
+import org.apache.hadoop.mapreduce.Job
+import org.apache.spark.{SparkContext, Logging}
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+import org.broadinstitute.variant.vcf.{VCFHeaderLine, VCFHeader}
 import parquet.hadoop.util.ContextUtil
+import scala.collection.JavaConversions._
+
+
+private object ADAMVCFOutputFormat {
+ private var header : Option[VCFHeader] = None
 
+ def getHeader : VCFHeader = header match {
+ case Some(h) => h
+ case None => setHeader(Seq())
+ }
+
+ def setHeader(samples: Seq[String]) : VCFHeader = {
+ header = Some(new VCFHeader(
+ (VariantContextConverter.infoHeaderLines ++ VariantContextConverter.formatHeaderLines).toSet : Set[VCFHeaderLine],
+ samples
+ ))
+ header.get
+ }
+}
+
+/**
+ * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. Depends on
+ * ADAMVCFOutputFormat object to maintain global state (such as samples)
+ *
+ * @tparam K
+ */
+private class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) {
+ setHeader(ADAMVCFOutputFormat.getHeader)
+}
 
 object ADAMVariationContext {
  implicit def sparkContextToADAMVariationContext(sc: SparkContext): ADAMVariationContext = new ADAMVariationContext(sc)
-
  implicit def rddToADAMVariantContextRDD(rdd: RDD[ADAMVariantContext]) = new ADAMVariantContextRDDFunctions(rdd)
+ implicit def rddToADAMGenotypeRDD(rdd: RDD[ADAMGenotype]) = new ADAMGenotypeRDDFunctions(rdd)
 }
 
 class ADAMVariationContext(sc: SparkContext) extends Serializable with Logging {
@@ -51,6 +83,34 @@ class ADAMVariationContext(sc: SparkContext) extends Serializable with Logging {
  log.info("Converted %d records".format(records.count()))
  records.flatMap(p => vcc.convert(p._2.get))
  }
+
+ def adamVCFSave(filePath: String, variants: RDD[ADAMVariantContext], dict: Option[SequenceDictionary] = None) = {
+ val vcfFormat = VCFFormat.inferFromFilePath(filePath)
+ assert(vcfFormat == VCFFormat.VCF, "BCF not yet supported") // TODO: Add BCF support
+
+ log.info("Writing %s file to %s".format(vcfFormat, filePath))
+
+ // Initialize global header object required by Hadoop VCF Writer
+ ADAMVCFOutputFormat.setHeader(variants.adamGetCallsetSamples)
+
+ // TODO: Sort variants according to sequence dictionary (if supplied)
+ val converter = new VariantContextConverter(dict)
+ val gatkVCs: RDD[VariantContextWritable] = variants.map(v => {
+ val vcw = new VariantContextWritable
+ vcw.set(converter.convert(v))
+ vcw
+ })
+ val withKey = gatkVCs.keyBy(v => new LongWritable(v.get.getStart))
+
+ val conf = sc.hadoopConfiguration
+ conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, vcfFormat.toString)
+ withKey.saveAsNewAPIHadoopFile(filePath,
+ classOf[LongWritable], classOf[VariantContextWritable], classOf[ADAMVCFOutputFormat[LongWritable]],
+ conf
+ )
+
+ log.info("Write %d records".format(gatkVCs.count))
+ }
 }
 
 
diff --git a/.../src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationRDDFunctions.scala b/.../src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationRDDFunctions.scala
@@ -17,8 +17,12 @@
 package edu.berkeley.cs.amplab.adam.rdd.variation
 
 import edu.berkeley.cs.amplab.adam.avro.{ADAMGenotype, ADAMDatabaseVariantAnnotation}
-import edu.berkeley.cs.amplab.adam.models.{ADAMVariantContext, SequenceDictionary, SequenceRecord}
-import edu.berkeley.cs.amplab.adam.rich.RichADAMVariant
+import edu.berkeley.cs.amplab.adam.models.{ADAMVariantContext,
+ ConcordanceTable,
+ SequenceDictionary,
+ SequenceRecord}
+import edu.berkeley.cs.amplab.adam.rich.{GenotypeType, RichADAMVariant}
+import edu.berkeley.cs.amplab.adam.rich.RichADAMGenotype._
 import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext._
@@ -60,4 +64,26 @@ class ADAMGenotypeRDDFunctions(rdd: RDD[ADAMGenotype]) extends Serializable with
  .groupByKey
  .map { case (v:RichADAMVariant, g) => new ADAMVariantContext(v, g, None) }
  }
+
+ def concordanceWith(truth: RDD[ADAMGenotype]) : RDD[(String, ConcordanceTable)] = {
+ val keyedTest = rdd.keyBy(g => (g.getVariant, g.getSampleId.toString) : (RichADAMVariant, String))
+ val keyedTruth = truth.keyBy(g => (g.getVariant, g.getSampleId.toString) : (RichADAMVariant, String))
+
+ val inTest = keyedTest.leftOuterJoin(keyedTruth)
+ val justInTruth = keyedTruth.subtractByKey(inTest)
+
+ // Compute RDD[sample -> ConcordanceTable] across variants/samples
+ val bySample = inTest.map({
+ case ((_, sample), (l, Some(r))) => sample -> (l.getType, r.getType)
+ case ((_, sample), (l, None)) => sample -> (l.getType, GenotypeType.NO_CALL)
+ }).union(justInTruth.map({ // add in "truth-only" entries
+ case ((_, sample), r) => sample -> (GenotypeType.NO_CALL, r.getType)
+ })).combineByKey(
+ ConcordanceTable.create,
+ ConcordanceTable.addComparison,
+ ConcordanceTable.mergeTable
+ )
+
+ bySample
+ }
 }
diff --git a/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rich/RichADAMGenotype.scala b/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rich/RichADAMGenotype.scala
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.rich
+
+import edu.berkeley.cs.amplab.adam.avro.{ADAMGenotypeAllele, ADAMGenotype}
+import scala.collection.JavaConversions._
+
+object RichADAMGenotype {
+ implicit def genotypeToRichGenotype(g: ADAMGenotype) = new RichADAMGenotype(g)
+ implicit def richGenotypeToGenotype(g: RichADAMGenotype) = g.genotype
+}
+
+class RichADAMGenotype(val genotype: ADAMGenotype) {
+ def getType: GenotypeType = {
+ genotype.getAlleles.toList.distinct match {
+ case List(ADAMGenotypeAllele.Ref) => GenotypeType.HOM_REF
+ case List(ADAMGenotypeAllele.Alt) => GenotypeType.HOM_ALT
+ case List(ADAMGenotypeAllele.Ref, ADAMGenotypeAllele.Alt) |
+ List(ADAMGenotypeAllele.Alt, ADAMGenotypeAllele.Ref) => GenotypeType.HET
+ case _ => GenotypeType.NO_CALL
+ }
+ }
+}
diff --git a/.../test/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMGenotypeRDDFunctionsSuite.scala b/.../test/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMGenotypeRDDFunctionsSuite.scala
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2014. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.rdd.variation
+
+import edu.berkeley.cs.amplab.adam.util.SparkFunSuite
+import edu.berkeley.cs.amplab.adam.avro._
+import edu.berkeley.cs.amplab.adam.models.ConcordanceTable
+import edu.berkeley.cs.amplab.adam.rdd.variation.ADAMVariationContext._
+import scala.collection.JavaConversions._
+import org.apache.spark.SparkContext._
+import edu.berkeley.cs.amplab.adam.rich.GenotypeType
+
+
+class ADAMGenotypeRDDFunctionsSuite extends SparkFunSuite {
+ def v0 = ADAMVariant.newBuilder
+ .setContig(ADAMContig.newBuilder.setContigName("11").build)
+ .setPosition(17409572)
+ .setReferenceAllele("T")
+ .setVariantAllele("C")
+ .build
+
+ sparkTest("concordance of identical and non-identical genotypes") {
+ val gb = ADAMGenotype.newBuilder().setVariant(v0)
+ .setSampleId("NA12878")
+ .setAlleles(List(ADAMGenotypeAllele.Ref, ADAMGenotypeAllele.Alt))
+
+ val g0 = gb.build
+ val g1 = gb.build
+ val tables0 = sc.parallelize(Seq(g0)).concordanceWith(sc.parallelize(Seq(g1))).collectAsMap
+ assert(tables0.size === 1)
+ val table0 = tables0.getOrElse("NA12878", ConcordanceTable())
+ assert(table0.total() === 1)
+ assert(table0.get(GenotypeType.HET, GenotypeType.HET) === 1)
+
+ val g2 = gb.setAlleles(List(ADAMGenotypeAllele.Ref, ADAMGenotypeAllele.Ref)).build
+ val table1 = sc.parallelize(Seq(g0))
+ .concordanceWith(sc.parallelize(Seq(g2))).collectAsMap()
+ .getOrElse("NA12878", ConcordanceTable())
+ assert(table1.total() === 1)
+ assert(table1.get(GenotypeType.HET, GenotypeType.HOM_REF) === 1)
+ }
+}
diff --git a/...scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariantContextRDDFunctionsSuite.scala b/...scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariantContextRDDFunctionsSuite.scala
@@ -47,8 +47,6 @@ class ADAMVariantContextRDDFunctionsSuite extends SparkFunSuite {
 
  // TODO: implicit conversion to ADAMVariantContextRDD
  val annotated = vc.joinDatabaseVariantAnnotation(vda)
- println("nealsid: " + annotated.count())
- println("nealsid: " + annotated.first.databases.isDefined)
  assert( annotated.map(_.databases.isDefined).reduce { (a,b) => a && b } )
  }