From faaf9ac5b0cf45663c220344026fed93941be58b Mon Sep 17 00:00:00 2001
From: Michael Heuer <heuermh@acm.org>
Date: Tue, 4 Sep 2018 10:35:51 -0500
Subject: [PATCH] Adding support for writing BED format per UCSC definition.

---
 .../adam/rdd/feature/FeatureRDD.scala         | 50 +++++++++++++++++-
 .../adam/rdd/feature/Features.scala           | 51 +++++++++++++++++++
 .../adam/rdd/feature/FeatureRDDSuite.scala    | 20 ++++++++
 3 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala
index cde23938ef..b9e277afd9 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala
@@ -202,13 +202,30 @@ object FeatureRDD {
    * @return Returns the feature as a single line BED string.
    */
   private[rdd] def toBed(feature: Feature): String = {
+    toBed(feature, None, None, None)
+  }
+
+  /**
+   * @param feature Feature to write in BED format.
+   * @return Returns the feature as a single line BED string.
+   */
+  private[rdd] def toBed(feature: Feature,
+                         minimumScore: Option[Double],
+                         maximumScore: Option[Double],
+                         missingValue: Option[Int]): String = {
+
     val chrom = feature.getContigName
     val start = feature.getStart
     val end = feature.getEnd
     val name = Features.nameOf(feature)
-    val score = Option(feature.getScore).getOrElse(".")
     val strand = Features.asString(feature.getStrand)
 
+    val score = if (minimumScore.isDefined && maximumScore.isDefined && missingValue.isDefined) {
+      Features.interpolateScore(feature.getScore, minimumScore.get, maximumScore.get, missingValue.get)
+    } else {
+      Features.formatScore(feature.getScore)
+    }
+
     if (!feature.getAttributes.containsKey("thickStart") &&
       !feature.getAttributes.containsKey("itemRgb") &&
       !feature.getAttributes.containsKey("blockCount")) {
@@ -492,7 +509,35 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd
   }
 
   /**
-   * Save this FeatureRDD in BED format.
+   * Save this FeatureRDD in UCSC BED format, where score is formatted as
+   * integer values between 0 and 1000, with missing value as specified.
+   *
+   * @param fileName The path to save BED formatted text file(s) to.
+   * @param asSingleFile By default (false), writes file to disk as shards with
+   *   one shard per partition. If true, we save the file to disk as a single
+   *   file by merging the shards.
+   * @param disableFastConcat If asSingleFile is true, disables the use of the
+   *   parallel file merging engine.
+   * @param minimumScore Minimum score, interpolated to 0.
+   * @param maximumScore Maximum score, interpolated to 1000.
+   * @param missingValue Value to use if score is not specified. Defaults to 0.
+   */
+  def saveAsUcscBed(fileName: String,
+                    asSingleFile: Boolean = false,
+                    disableFastConcat: Boolean = false,
+                    minimumScore: Double,
+                    maximumScore: Double,
+                    missingValue: Int = 0) = {
+
+    writeTextRdd(rdd.map(FeatureRDD.toBed(_, Some(minimumScore), Some(maximumScore), Some(missingValue))),
+      fileName,
+      asSingleFile,
+      disableFastConcat)
+  }
+
+  /**
+   * Save this FeatureRDD in bedtools2 BED format, where score is formatted
+   * as double floating point values with missing values.
    *
    * @param fileName The path to save BED formatted text file(s) to.
    * @param asSingleFile By default (false), writes file to disk as shards with
@@ -504,6 +549,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd
   def saveAsBed(fileName: String,
                 asSingleFile: Boolean = false,
                 disableFastConcat: Boolean = false) = {
+
     writeTextRdd(rdd.map(FeatureRDD.toBed),
       fileName,
       asSingleFile,
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala
index 9b18848040..f46acc5589 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala
@@ -225,4 +225,55 @@ private[feature] object Features {
     }
     "sequence_feature"
   }
+
+  /**
+   * Format the feature score as double floating point values
+   * with "." as missing value.
+   *
+   * @param score Feature score to format.
+   * @return Return the specified feature score formatted as
+   *    double floating point values with "." as missing value.
+   */
+  def formatScore(score: java.lang.Double): String = {
+    Option(score).fold(".")(_.toString)
+  }
+
+  /**
+   * Interpolate the feature score to integer values between 0 and 1000,
+   * with missing value as specified.
+   *
+   * @param score Feature score to interpolate.
+   * @param minimumScore Minimum score, interpolated to 0.
+   * @param maximumScore Maximum score, interpolated to 1000.
+   * @param missingValue Value to use if score is not specified.
+   * @return Return the specified feature score interpolated to integer values
+   *    between 0 and 1000, with missing value as specified.
+   */
+  def interpolateScore(score: java.lang.Double,
+                       minimumScore: Double,
+                       maximumScore: Double,
+                       missingValue: Int): Int = {
+
+    def constrain(v: Double, min: Double, max: Double): Double = {
+      if (v < min) {
+        min
+      } else if (v > max) {
+        max
+      } else {
+        v
+      }
+    }
+
+    def interp(value: Double,
+               sourceMin: Double,
+               sourceMax: Double,
+               targetMin: Double,
+               targetMax: Double): Double = {
+
+      val v = constrain(value, sourceMin, sourceMax)
+      targetMin + (targetMax - targetMin) * ((v - sourceMin) / (sourceMax - sourceMin))
+    }
+
+    Option(score).fold(missingValue)(interp(_, minimumScore, maximumScore, 0, 1000).toInt)
+  }
 }
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala
index 1950737fbc..f77c34c615 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala
@@ -343,6 +343,26 @@ class FeatureRDDSuite extends ADAMFunSuite {
     checkFiles(inputPath, outputPath)
   }
 
+  sparkTest("save to UCSC BED format") {
+    val inputPath = testFile("dvl1.200.bed")
+    val expected = sc.loadBed(inputPath)
+    val outputPath = tempLocation(".bed")
+    expected.saveAsUcscBed(outputPath,
+      asSingleFile = true,
+      minimumScore = 0.0,
+      maximumScore = 200.0)
+
+    val lines = sc.textFile(outputPath)
+    val bedCols = lines.first.split("\t")
+    assert(bedCols.size === 6)
+    assert(bedCols(0) === "1")
+    assert(bedCols(1) === "1331345")
+    assert(bedCols(2) === "1331536")
+    assert(bedCols(3) === "106624")
+    assert(bedCols(4) === "67")
+    assert(bedCols(5) === "+")
+  }
+
   sparkTest("save IntervalList as GTF format") {
     val inputPath = testFile("SeqCap_EZ_Exome_v3.hg19.interval_list")
     val features = sc.loadIntervalList(inputPath)