Added support for coverage calculation from alignment files

bigdatagenomics · Aug 18, 2016 · f15afe5 · f15afe5
1 parent e7e1adf
commit f15afe5
Show file tree

Hide file tree

Showing 15 changed files with 687 additions and 60 deletions.
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala
@@ -51,7 +51,8 @@ object ADAMMain {
           Features2ADAM,
           WigFix2Bed,
           Fragments2Reads,
-          Reads2Fragments
+          Reads2Fragments,
+          Reads2Coverage
         )
       ),
       CommandGroup(

diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala
@@ -0,0 +1,85 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.cli
+
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.projections.AlignmentRecordField._
+import org.bdgenomics.adam.projections.Projection
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.features.CoverageRDD
+import org.bdgenomics.adam.rdd.read.{ AlignedReadRDD, AlignmentRecordRDD }
+import org.bdgenomics.utils.cli._
+import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
+
+/**
+ * Reads2Coverage (accessible as the command 'reads2coverage' through the CLI) takes two arguments,
+ * an INPUT and OUTPUT, and calculates the number of reads from INPUT at every location in
+ * the file. Optional arguments are only_negative_strands, only_positive_strands and collapse.
+ * only_negative_strands and only_positive_strands save coverage computed from only negative and positive strands,
+ * respectively. Collapse specifies whether saved coverage should merge neighboring coverage with the same counts
+ * to one record.
+ */
+object Reads2Coverage extends BDGCommandCompanion {
+  val commandName: String = "reads2coverage"
+  val commandDescription: String = "Calculate the coverage from a given ADAM file"
+
+  def apply(cmdLine: Array[String]): BDGCommand = {
+    new Reads2Coverage(Args4j[Reads2CoverageArgs](cmdLine))
+  }
+}
+
+class Reads2CoverageArgs extends Args4jBase with ParquetArgs {
+  @Argument(required = true, metaVar = "INPUT", usage = "The reads file to use to calculate depths", index = 0)
+  var inputPath: String = null
+  @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the coverage data in ADAM/Parquet format", index = 1)
+  var outputPath: String = null
+  @Args4jOption(required = false, name = "-collapse", usage = "Collapses neighboring coverage records " +
+    "of equal counts into the same record")
+  var collapse: Boolean = true
+  @Args4jOption(required = false, name = "-only_negative_strands", usage = "Compute coverage for negative strands")
+  var onlyNegativeStrands: Boolean = false
+  @Args4jOption(required = false, name = "-only_positive_strands", usage = "Compute coverage for positive strands")
+  var onlyPositiveStrands: Boolean = false
+}
+
+class Reads2Coverage(protected val args: Reads2CoverageArgs) extends BDGSparkCommand[Reads2CoverageArgs] {
+  val companion: BDGCommandCompanion = Reads2Coverage
+
+  def run(sc: SparkContext): Unit = {
+
+    val proj = Projection(contigName, start, end, cigar)
+
+    // If saving strand specific coverage, require that only one direction is specified
+    require(!(args.onlyNegativeStrands && args.onlyPositiveStrands),
+      "Cannot compute coverage for both negative and positive strands separately")
+
+    // load reads
+    val readsRdd: AlignmentRecordRDD = sc.loadAlignments(args.inputPath, projection = Some(proj))
+
+    val finalReads = if (args.onlyNegativeStrands && !args.onlyPositiveStrands) {
+      readsRdd.transform(rdd => rdd.filter(_.getReadNegativeStrand))
+    } else if (!args.onlyNegativeStrands && args.onlyPositiveStrands) {
+      readsRdd.transform(rdd => rdd.filter(!_.getReadNegativeStrand))
+    } else {
+      readsRdd
+    }
+
+    finalReads.toCoverage(args.collapse)
+      .save(args.outputPath)
+  }
+}
diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Reads2CoverageSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Reads2CoverageSuite.scala
@@ -0,0 +1,37 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.cli
+
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.util.ADAMFunSuite
+import org.bdgenomics.utils.cli.Args4j
+
+class Reads2CoverageSuite extends ADAMFunSuite {
+
+  sparkTest("correctly calculates coverage from small sam file") {
+    val inputPath = copyResource("artificial.sam")
+    val outputPath = tmpFile("coverage.adam")
+
+    val args: Array[String] = Array(inputPath, outputPath)
+    new Reads2Coverage(Args4j[Reads2CoverageArgs](args)).run(sc)
+    val coverage = sc.loadCoverage(outputPath)
+
+    val pointCoverage = coverage.flatten.rdd.filter(_.start == 30).first
+    assert(pointCoverage.count == 5)
+  }
+}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala
@@ -0,0 +1,85 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.models
+
+import org.apache.spark.rdd.RDD
+import org.bdgenomics.formats.avro.Feature
+
+/**
+ * Converts from avro Feature to Coverage.
+ */
+object Coverage {
+
+  /**
+   * Creates Coverage from ReferenceRegion and coverage count in that ReferenceRegion.
+   *
+   * @param region ReferenceRegion in which Coverage spans
+   * @param count Coverage count for each base pair in region
+   * @return Coverage spanning the specified ReferenceRegion
+   */
+  private[adam] def apply(region: ReferenceRegion, count: Double): Coverage = {
+    Coverage(region.referenceName, region.start, region.end, count)
+  }
+
+  /**
+   * Creates Coverage from Feature, extracting region information and feature score for coverage.
+   *
+   * @param feature Feature to create coverage from
+   * @return Coverage spanning the specified feature
+   */
+  private[adam] def apply(feature: Feature): Coverage = {
+    Coverage(feature.getContigName, feature.getStart, feature.getEnd, feature.getScore)
+  }
+
+  /**
+   * Creates an RDD of Coverage from RDD of Features.
+   *
+   * @param rdd RDD of Features to extract Coverage from
+   * @return RDD of Coverage spanning all features in rdd
+   */
+  private[adam] def apply(rdd: RDD[Feature]): RDD[Coverage] = {
+    rdd.map(f => Coverage(f))
+  }
+}
+
+/**
+ * Coverage record for CoverageRDD.
+ * Contains Region indexed by contig name, start and end, as well as count of coverage at
+ * each base pair in that region.
+ *
+ * @param contigName Specifies chromosomal location of coverage
+ * @param start Specifies start position of coverage
+ * @param end  Specifies end position of coverage
+ * @param count Specifies count of coverage at location
+ */
+case class Coverage(contigName: String, start: Long, end: Long, count: Double) {
+
+  /**
+   * Converts Coverage to Feature, setting Coverage count in the score attribute.
+   *
+   * @return Feature built from Coverage
+   */
+  def toFeature: Feature = {
+    val fb = Feature.newBuilder()
+    fb.setContigName(contigName)
+    fb.setStart(start)
+    fb.setEnd(end)
+    fb.setScore(count)
+    fb.build()
+  }
+}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
@@ -150,9 +150,25 @@ object ReferenceRegion {
     }
   }
 
+  /**
+   * Extracts ReferenceRegion from Feature
+   *
+   * @param feature Feature to extract ReferenceRegion from
+   * @return Extracted ReferenceRegion
+   */
   def apply(feature: Feature): ReferenceRegion = {
     new ReferenceRegion(feature.getContigName, feature.getStart, feature.getEnd)
   }
+
+  /**
+   * Extracts ReferenceRegion from Coverage
+   *
+   * @param coverage Coverage to extract ReferenceRegion from
+   * @return Extracted ReferenceRegion
+   */
+  def apply(coverage: Coverage): ReferenceRegion = {
+    new ReferenceRegion(coverage.contigName, coverage.start, coverage.end)
+  }
 }
 
 /**

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala
@@ -17,12 +17,12 @@
  */
 package org.bdgenomics.adam.rdd
 
-import collection.JavaConverters._
 import java.io.{ File, FileNotFoundException, InputStream }
 import java.util.regex.Pattern
+
 import com.google.common.collect.ImmutableList
-import htsjdk.samtools.{ SAMFileHeader, ValidationStringency }
 import htsjdk.samtools.util.{ Interval, Locatable }
+import htsjdk.samtools.{ SAMFileHeader, ValidationStringency }
 import htsjdk.variant.vcf.VCFHeader
 import org.apache.avro.Schema
 import org.apache.avro.file.DataFileStream
@@ -35,33 +35,35 @@ import org.apache.parquet.avro.{ AvroParquetInputFormat, AvroReadSupport }
 import org.apache.parquet.filter2.predicate.FilterPredicate
 import org.apache.parquet.hadoop.ParquetInputFormat
 import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.spark.SparkContext
 import org.apache.spark.rdd.MetricsContext._
 import org.apache.spark.rdd.RDD
-import org.apache.spark.SparkContext
 import org.bdgenomics.adam.converters._
 import org.bdgenomics.adam.instrumentation.Timers._
 import org.bdgenomics.adam.io._
 import org.bdgenomics.adam.models._
-import org.bdgenomics.adam.projections.{ AlignmentRecordField, NucleotideContigFragmentField, Projection }
+import org.bdgenomics.adam.projections.{
+  AlignmentRecordField,
+  FeatureField,
+  NucleotideContigFragmentField,
+  Projection
+}
 import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD
 import org.bdgenomics.adam.rdd.features._
 import org.bdgenomics.adam.rdd.fragment.FragmentRDD
-import org.bdgenomics.adam.rdd.read.{
-  AlignedReadRDD,
-  AlignmentRecordRDD,
-  UnalignedReadRDD
-}
+import org.bdgenomics.adam.rdd.read.{ AlignedReadRDD, AlignmentRecordRDD, UnalignedReadRDD }
 import org.bdgenomics.adam.rdd.variation._
 import org.bdgenomics.adam.rich.RichAlignmentRecord
-import org.bdgenomics.adam.util.{ TwoBitFile, ReferenceContigMap, ReferenceFile }
+import org.bdgenomics.adam.util.{ ReferenceContigMap, ReferenceFile, TwoBitFile }
 import org.bdgenomics.formats.avro._
 import org.bdgenomics.utils.instrumentation.Metrics
 import org.bdgenomics.utils.io.LocalFileByteAccess
-import org.bdgenomics.utils.misc.HadoopUtil
-import org.bdgenomics.utils.misc.Logging
+import org.bdgenomics.utils.misc.{ HadoopUtil, Logging }
 import org.seqdoop.hadoop_bam._
 import org.seqdoop.hadoop_bam.util._
+
 import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.collection.Map
 import scala.reflect.ClassTag
 
@@ -757,6 +759,17 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
     FragmentRDD.fromRdd(records.map(fastqRecordConverter.convertFragment))
   }
 
+  /**
+   * Loads Parquet file of Features to a CoverageRDD.
+   * Coverage is stored in the score attribute of Feature.
+   * @param filePath File path to load coverage from
+   * @return CoverageRDD containing an RDD of Coverage
+   */
+  def loadCoverage(filePath: String): CoverageRDD = {
+    val proj = Projection(FeatureField.contigName, FeatureField.start, FeatureField.end, FeatureField.score)
+    loadFeatures(filePath, projection = Some(proj)).toCoverage
+  }
+
   def loadGff3(filePath: String, minPartitions: Option[Int] = None): FeatureRDD = {
     val records = sc.textFile(filePath, minPartitions.getOrElse(sc.defaultParallelism)).flatMap(new GFF3Parser().parse)
     if (Metrics.isRecording) records.instrument() else records
@@ -999,7 +1012,6 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
     } else if (filePath.endsWith(".fa") ||
       filePath.endsWith(".fasta")) {
       log.info(s"Loading $filePath as FASTA and converting to AlignmentRecords. Projection is ignored.")
-      import ADAMContext._
       UnalignedReadRDD(loadFasta(filePath, fragmentLength = 10000).toReads,
         RecordGroupDictionary.empty)
     } else if (filePath.endsWith("contig.adam")) {