bigdatagenomics · fnothaft · Feb 20, 2014 · Feb 18, 2014 · Feb 18, 2014 · Feb 18, 2014
diff --git a/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/AdamContext.scala b/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/AdamContext.scala
@@ -20,7 +20,6 @@ import edu.berkeley.cs.amplab.adam.converters.SAMRecordConverter
 import edu.berkeley.cs.amplab.adam.models._
 import org.apache.hadoop.fs.FileSystem
 import edu.berkeley.cs.amplab.adam.projections.{ADAMRecordField, Projection}
-import edu.berkeley.cs.amplab.adam.rdd.compare.CompareAdam
 import fi.tkk.ics.hadoop.bam.util.SAMHeaderReader
 import fi.tkk.ics.hadoop.bam.{SAMRecordWritable, AnySAMInputFormat}
 import org.apache.avro.Schema
@@ -232,11 +231,6 @@ class AdamContext(sc: SparkContext) extends Serializable with Logging {
     }
   }
 
-  def adamCompareFiles(file1Path: String, file2Path: String,
-                       predicateFactory: (Map[Int, Int]) => (SingleReadBucket, SingleReadBucket) => Boolean) = {
-    CompareAdam.compareADAM(sc, file1Path, file2Path, predicateFactory)
-  }
-
   /**
    * Searches a path recursively, returning the names of all directories in the tree whose
    * name matches the given regex.

diff --git a/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/AdamRDDFunctions.scala b/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/AdamRDDFunctions.scala
@@ -15,23 +15,21 @@
  */
 package edu.berkeley.cs.amplab.adam.rdd
 
-import edu.berkeley.cs.amplab.adam.avro.{ADAMGenotype, ADAMGenotypeAllele, ADAMPileup, ADAMRecord, ADAMVariant, ADAMVariantDomain}
-import edu.berkeley.cs.amplab.adam.converters.{GenotypesToVariantsConverter, VariantContextConverter}
-import edu.berkeley.cs.amplab.adam.models.{ADAMRod, ADAMVariantContext, ReferencePosition, SequenceDictionary, SequenceRecord, SingleReadBucket, SnpTable}
-import edu.berkeley.cs.amplab.adam.projections.{ADAMVariantAnnotations, ADAMVariantField}
-import edu.berkeley.cs.amplab.adam.rdd.AdamContext._
-import edu.berkeley.cs.amplab.adam.util.ParquetLogger
-import org.apache.avro.specific.SpecificRecord
+import parquet.hadoop.metadata.CompressionCodecName
 import org.apache.hadoop.mapreduce.Job
-import org.apache.spark.Logging
-import org.apache.spark.SparkContext._
-import org.apache.spark.rdd.RDD
-import org.broadinstitute.variant.variantcontext.{Allele, VariantContext}
-import parquet.avro.{AvroParquetOutputFormat, AvroWriteSupport}
 import parquet.hadoop.ParquetOutputFormat
-import parquet.hadoop.metadata.CompressionCodecName
+import parquet.avro.{AvroParquetOutputFormat, AvroWriteSupport}
 import parquet.hadoop.util.ContextUtil
+import org.apache.avro.specific.SpecificRecord
+import edu.berkeley.cs.amplab.adam.avro.{ADAMPileup, ADAMRecord}
+import edu.berkeley.cs.amplab.adam.models.{SequenceRecord, SequenceDictionary, SingleReadBucket, SnpTable, ReferencePosition, ADAMRod}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.SparkContext._
+import org.apache.spark.Logging
+import java.io.File
+import edu.berkeley.cs.amplab.adam.util.ParquetLogger
 import java.util.logging.Level
+import edu.berkeley.cs.amplab.adam.rich.RichADAMRecord._
 
 class AdamRDDFunctions[T <% SpecificRecord : Manifest](rdd: RDD[T]) extends Serializable {
 
@@ -107,10 +105,6 @@ class AdamRecordRDDFunctions(rdd: RDD[ADAMRecord]) extends Serializable with Log
     RecalibrateBaseQualities(rdd, broadcastDbSNP)
   }
 
-  def adamRealignIndels(): RDD[ADAMRecord] = {
-    RealignIndels(rdd)
-  }
-
   // Returns a tuple of (failedQualityMetrics, passedQualityMetrics)
   def adamFlagStat(): (FlagStatMetrics, FlagStatMetrics) = {
     FlagStat(rdd)
@@ -151,15 +145,14 @@ class AdamRecordRDDFunctions(rdd: RDD[ADAMRecord]) extends Serializable with Log
      * @param r Read to map.
      * @return List containing one or two mapping key/value pairs.
      */
-    def mapToBucket (r: ADAMRecord): List[(ReferencePosition, ADAMRecord)] = {
+    def mapToBucket (r: ADAMRecord): List[(Long, ADAMRecord)] = {
       val s = r.getStart / bucketSize
       val e = r.end.get / bucketSize
-      val id = r.getReferenceId
 
       if (s == e) {
-        List((new ReferencePosition(id, s), r))
+        List((s, r))
       } else {
-        List((new ReferencePosition(id, s), r), (new ReferencePosition(id, e), r))
+        List((s, r), (e, r))
       }
     }
 
@@ -179,11 +172,11 @@ class AdamRecordRDDFunctions(rdd: RDD[ADAMRecord]) extends Serializable with Log
      * @param bucket Tuple of (bucket number, reads in bucket).
      * @return A sequence containing the rods in this bucket.
      */
-    def bucketedReadsToRods (bucket: (ReferencePosition, Seq[ADAMRecord])): Seq[ADAMRod] = {
-      val (bucketStart, bucketReads) = bucket
+    def bucketedReadsToRods (bucket: (Long, Seq[ADAMRecord])): Seq[ADAMRod] = {
+      val (bucketNumber, bucketReads) = bucket
 
       bucketReads.flatMap(pp.readToPileups)
-        .groupBy(ReferencePosition(_))
+        .groupBy(_.getPosition)
         .toList
         .map(g => ADAMRod(g._1, g._2.toList)).toSeq
     }
@@ -215,7 +208,7 @@ class AdamPileupRDDFunctions(rdd: RDD[ADAMPileup]) extends Serializable with Log
    * @return RDD with rods grouped by reference position.
    */
   def adamPileupsToRods(coverage: Int = 30): RDD[ADAMRod] = {
-    val groups = rdd.groupBy((p: ADAMPileup) => ReferencePosition(p), coverage)
+    val groups = rdd.groupBy((p: ADAMPileup) => p.getPosition, coverage)
 
     groups.map(kv => ADAMRod(kv._1, kv._2.toList))
   }
@@ -241,7 +234,7 @@ class AdamRodRDDFunctions(rdd: RDD[ADAMRod]) extends Serializable with Logging {
    *
    * @return Rods split up by samples and grouped together by position.
    */
-  def adamDivideRodsBySamples(): RDD[(ReferencePosition, List[ADAMRod])] = {
+  def adamDivideRodsBySamples(): RDD[(Long, List[ADAMRod])] = {
     rdd.keyBy(_.position).map(r => (r._1, r._2.splitBySamples))
   }
 
@@ -276,107 +269,3 @@ class AdamRodRDDFunctions(rdd: RDD[ADAMRod]) extends Serializable with Logging {
     totalBases.toDouble / rdd.count.toDouble
   }
 }
-
-class AdamVariantContextRDDFunctions(rdd: RDD[ADAMVariantContext]) extends Serializable {
-
-  /**
-   * Save function for variant contexts. Disaggregates internal fields of variant context
-   * and saves to Parquet files.
-   *
-   * @param filePath Master file path for parquet files.
-   * @param blockSize Parquet block size.
-   * @param pageSize Parquet page size.
-   * @param compressCodec Parquet compression codec.
-   * @param disableDictionaryEncoding If true, disables dictionary encoding in Parquet.
-   * @return Returns the initial RDD.
-   */
-  def adamSave(filePath: String, blockSize: Int = 128 * 1024 * 1024,
-               pageSize: Int = 1 * 1024 * 1024, compressCodec: CompressionCodecName = CompressionCodecName.GZIP,
-               disableDictionaryEncoding: Boolean = false): RDD[ADAMVariantContext] = {
-
-    // Add the Void Key
-    val variantToSave: RDD[ADAMVariant] = rdd.flatMap(p => p.variants)
-    val genotypeToSave: RDD[ADAMGenotype] = rdd.flatMap(p => p.genotypes)
-    val domainsToSave: RDD[ADAMVariantDomain] = rdd.flatMap(p => p.domains)
-
-    // save records
-    variantToSave.adamSave(filePath + ".v",
-      blockSize,
-      pageSize,
-      compressCodec,
-      disableDictionaryEncoding)
-    genotypeToSave.adamSave(filePath + ".g",
-      blockSize,
-      pageSize,
-      compressCodec,
-      disableDictionaryEncoding)
-
-    // check if we have domains to save or not
-    if (domainsToSave.count() != 0) {
-      val fileExtension = ADAMVariantAnnotations.fileExtensions(ADAMVariantAnnotations.ADAMVariantDomain)
-
-      domainsToSave.adamSave(filePath + fileExtension,
-        blockSize,
-        pageSize,
-        compressCodec,
-        disableDictionaryEncoding)
-    }
-
-    rdd
-  }
-
-}
-
-class AdamGenotypeRDDFunctions(rdd: RDD[ADAMGenotype]) extends Serializable {
-
-  /**
-   * Validates that an RDD of genotypes is correctly formed.
-   *
-   * @return True if RDD is correctly formed.
-   * @throws IllegalArgumentException Throws exception if RDD is not correctly formed.
-   */
-  def adamValidateGenotypes(): Boolean = {
-    val validator = new GenotypesToVariantsConverter(true, true)
-    val groupedGenotypes = rdd.groupBy(g => (g.getPosition, g.getSampleId))
-    groupedGenotypes.map(_._2.toList).foreach(validator.validateGenotypes)
-
-    true
-  }
-
-  /**
-   *  Converts an RDD of GATK Variants to an RDD of ADAMVariantContext objects
-   */
-  def gatkVariantsToADAMVariants(gatkAlleles : RDD[VariantContext]) : RDD[ADAMVariantContext] = {
-    val vcc : VariantContextConverter = new VariantContextConverter
-    gatkAlleles.flatMap(vcc.convert(_))
-  }
-
-  /**
-   * Calculates Variants from an RDD of genotypes. This allows for on-the-fly creation of variant
-   * data from a subset of a population. This function also allows an RDD of variant data to be provided.
-   * Data can be taken from this RDD by adding to the projection set.
-   *
-   * @param variants Optional RDD of variant data to supplement genotype info.
-   * @param variantProjection The set of fields to copy from the variant data, if this is provided.
-   * @param performValidation Whether to validate that the genotype data is well formed.
-   * @param failOnValidationError If validation is performed and failOnValidationError is true, an exception will
-   *                              be thrown if an error is encountered.
-   * @return An RDD containing variant data.
-   *
-   * @throws IllegalArgumentException Throws an exception if performValidation and failOnValidationError are true
-   *                                  and the RDD of genotypes has bad data.
-   */
-  def adamConvertGenotypes(variants: Option[RDD[ADAMVariant]] = None,
-                           variantProjection: Set[ADAMVariantField.Value] = Set[ADAMVariantField.Value](),
-                           performValidation: Boolean = false,
-                           failOnValidationError: Boolean = false): RDD[ADAMVariant] = {
-    val computer = new GenotypesToVariantsConverter(performValidation, failOnValidationError)
-    val groupedGenotypes = rdd.groupBy(g => g.getPosition)
-    val groupedGenotypesWithVariants: RDD[(java.lang.Long, (Seq[ADAMGenotype], Option[Seq[ADAMVariant]]))] = variants match {
-      case Some(o) => groupedGenotypes.leftOuterJoin(o.asInstanceOf[RDD[ADAMVariant]].groupBy(_.getPosition))
-      case None => groupedGenotypes.map(kv => (kv._1, (kv._2, None.asInstanceOf[Option[Seq[ADAMVariant]]])))
-    }
-
-    groupedGenotypesWithVariants.map(_._2).flatMap(vg => computer.convert(vg._1, vg._2, variantProjection))
-  }
-}
diff --git a/...-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationContext.scala b/...-core/src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationContext.scala
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.rdd.variation
+
+import org.apache.spark.rdd.RDD
+import edu.berkeley.cs.amplab.adam.models.ADAMVariantContext
+import org.apache.spark.{Logging, SparkContext}
+import org.apache.hadoop.mapreduce.Job
+import edu.berkeley.cs.amplab.adam.converters.VariantContextConverter
+import fi.tkk.ics.hadoop.bam.{VariantContextWritable, VCFInputFormat}
+import org.apache.hadoop.io.LongWritable
+import parquet.hadoop.util.ContextUtil
+
+
+/**
+ * Implicit conversions that attach the variation-specific operations to Spark types.
+ */
+object ADAMVariationContext {
+  /** Wraps a SparkContext so that the methods of the ADAMVariationContext class can be called on it. */
+  implicit def sparkContextToADAMVariationContext(sc: SparkContext): ADAMVariationContext = new ADAMVariationContext(sc)
+
+  /**
+   * Wraps an RDD of variant contexts in ADAMVariantContextRDDFunctions.
+   *
+   * The result type is spelled out so that the signature of this public implicit does not
+   * depend on type inference.
+   */
+  implicit def rddToADAMVariantContextRDD(rdd: RDD[ADAMVariantContext]): ADAMVariantContextRDDFunctions =
+    new ADAMVariantContextRDDFunctions(rdd)
+}
+
+/**
+ * Variation-specific loading operations built on top of a SparkContext.
+ */
+class ADAMVariationContext(sc: SparkContext) extends Serializable with Logging {
+
+  /**
+   * Loads a VCF file as an RDD of ADAMVariantContext records.
+   *
+   * The file is opened through VCFInputFormat and every record read is passed to
+   * VariantContextConverter.convert; all contexts it returns are flattened into the result.
+   * No Spark action is triggered here, so the input is only read when the returned RDD is
+   * evaluated.
+   *
+   * @param filePath Path of the input VCF file to read.
+   * @return RDD of the variant contexts produced by the converter for each VCF record.
+   */
+  def adamVCFLoad(filePath: String): RDD[ADAMVariantContext] = {
+    log.info("Reading VCF file from %s".format(filePath))
+    val job = Job.getInstance(sc.hadoopConfiguration)
+    val vcc = new VariantContextConverter
+    val records = sc.newAPIHadoopFile(
+      filePath,
+      classOf[VCFInputFormat], classOf[LongWritable], classOf[VariantContextWritable],
+      ContextUtil.getConfiguration(job)
+    )
+    // Deliberately no records.count() for logging: count() is an action that would scan the
+    // whole VCF once just to build a log message, and the input would then be read again when
+    // the caller evaluates the RDD returned below.
+    // Each element is a (LongWritable, VariantContextWritable) pair; only the value is converted.
+    records.flatMap(p => vcc.convert(p._2.get))
+  }
+}
+
+
diff --git a/.../src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationRDDFunctions.scala b/.../src/main/scala/edu/berkeley/cs/amplab/adam/rdd/variation/ADAMVariationRDDFunctions.scala
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2013. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.rdd.variation
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+import edu.berkeley.cs.amplab.adam.models.ADAMVariantContext
+import edu.berkeley.cs.amplab.adam.avro.{ADAMGenotype, ADAMDatabaseVariantAnnotation}
+import org.apache.spark.SparkContext._
+import edu.berkeley.cs.amplab.adam.rich.RichADAMVariant
+
+/**
+ * Operations made available on an RDD of ADAMVariantContext records.
+ */
+class ADAMVariantContextRDDFunctions(rdd: RDD[ADAMVariantContext]) extends Serializable with Logging {
+  // NOTE(review): presumably forces initialisation of the logger mixed in from Logging — confirm.
+  initLogging()
+
+  /**
+   * Left outer joins database variant annotations onto the variant contexts of this RDD.
+   *
+   * Contexts are keyed by `variant` and annotations by `getVariant`. The join key is then
+   * dropped and each (context, optional annotation) pair is rebuilt as a new context that
+   * carries the original variant and genotypes, with the joined annotation passed as
+   * `databases`.
+   *
+   * NOTE(review): key matching presumably relies on RichADAMVariant equality — confirm the type
+   * of ADAMVariantContext.variant.
+   *
+   * @param ann Database variant annotations to attach.
+   * @return Variant contexts rebuilt with the joined annotation, if any.
+   */
+  def joinDatabaseVariantAnnotation(ann: RDD[ADAMDatabaseVariantAnnotation]): RDD[ADAMVariantContext] = {
+    rdd.keyBy(_.variant)
+      .leftOuterJoin(ann.keyBy(_.getVariant))
+      .values
+      .map { case (v:ADAMVariantContext, a) => new ADAMVariantContext(v.variant, v.genotypes, databases = a) }
+  }
+}
+
+/**
+ * Operations made available on an RDD of ADAMGenotype records.
+ */
+class ADAMGenotypeRDDFunctions(rdd: RDD[ADAMGenotype]) extends Serializable with Logging {
+  // NOTE(review): presumably forces initialisation of the logger mixed in from Logging — confirm.
+  initLogging()
+
+  /**
+   * Groups the genotypes by variant and builds one ADAMVariantContext per group.
+   *
+   * Each genotype is keyed by the RichADAMVariant wrapper of its `getVariant` value, so
+   * genotypes land in the same group when their wrapped variants compare equal. The group's
+   * key is passed to the context as its variant and the grouped genotypes as `genotypes`.
+   *
+   * @return One variant context per distinct variant key, holding the genotypes grouped under it.
+   */
+  def toADAMVariantContext(): RDD[ADAMVariantContext] = {
+    rdd.keyBy({ g => RichADAMVariant.variantToRichVariant(g.getVariant) })
+      .groupByKey
+      .map { case (v:RichADAMVariant, g) => new ADAMVariantContext(v, genotypes = g) }
+  }
+}
diff --git a/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rich/RichADAMVariant.scala b/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/rich/RichADAMVariant.scala
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014. Mount Sinai School of Medicine
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.berkeley.cs.amplab.adam.rich
+
+import edu.berkeley.cs.amplab.adam.avro.{ADAMContig, ADAMVariant}
+import java.util.Arrays
+
+/**
+ * Implicit conversions between ADAMVariant and its RichADAMVariant wrapper.
+ */
+object RichADAMVariant {
+  /** Wraps a variant record in a RichADAMVariant. */
+  implicit def variantToRichVariant(variant: ADAMVariant): RichADAMVariant =
+    new RichADAMVariant(variant)
+
+  /** Unwraps a RichADAMVariant, returning the variant record it was constructed with. */
+  implicit def richVariantToVariant(variant: RichADAMVariant): ADAMVariant =
+    variant.variant
+}
+
+/**
+ * Wrapper around an ADAMVariant that supplies value-based equals and hashCode, built from the
+ * contig, position, reference allele and variant allele of the wrapped record.
+ *
+ * NOTE(review): a contig whose reference MD5 is null compares equal to any contig with the same
+ * name, so equality is not transitive when MD5s are only partially populated — confirm this is
+ * acceptable wherever instances are used as keys.
+ */
+class RichADAMVariant(val variant: ADAMVariant) {
+  // Of the contig's fields only the name is hashed. equals lets a null reference MD5 match any
+  // MD5, so including the MD5 here could give two equal variants different hash codes.
+  val hashObjects = Array[Object](
+    variant.getContig.getContigName,
+    variant.getPosition,
+    variant.getReferenceAllele,
+    variant.getVariantAllele)
+
+  override def hashCode = Arrays.hashCode(hashObjects)
+
+  /** True when both contigs share a name and their reference MD5s agree or at least one is null. */
+  private def isSameContig(left: ADAMContig, right: ADAMContig): Boolean = {
+    if (left.getContigName != right.getContigName) {
+      false
+    } else {
+      val leftMD5 = left.getReferenceMD5
+      val rightMD5 = right.getReferenceMD5
+      leftMD5 == null || rightMD5 == null || leftMD5 == rightMD5
+    }
+  }
+
+  override def equals(o: Any) = o match {
+    case that: RichADAMVariant =>
+      val other = that.variant
+      variant.getPosition == other.getPosition &&
+        isSameContig(variant.getContig, other.getContig) &&
+        variant.getReferenceAllele == other.getReferenceAllele &&
+        variant.getVariantAllele == other.getVariantAllele
+    case _ => false
+  }
+}
diff --git a/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/util/ParquetFileTraversable.scala b/adam-core/src/main/scala/edu/berkeley/cs/amplab/adam/util/ParquetFileTraversable.scala
@@ -31,7 +31,7 @@ class ParquetFileTraversable[T <: IndexedRecord](sc: SparkContext, file: Path) e
     }
     val status = fs.getFileStatus(file)
     var paths = List[Path]()
-    if (status.isDir) {
+    if (status.isDirectory) {
       val files = fs.listStatus(file)
       files.foreach {
         file =>