From faaf9ac5b0cf45663c220344026fed93941be58b Mon Sep 17 00:00:00 2001 From: Michael Heuer Date: Tue, 4 Sep 2018 10:35:51 -0500 Subject: [PATCH] Adding support for writing BED format per UCSC definition. --- .../adam/rdd/feature/FeatureRDD.scala | 50 +++++++++++++++++- .../adam/rdd/feature/Features.scala | 51 +++++++++++++++++++ .../adam/rdd/feature/FeatureRDDSuite.scala | 20 ++++++++ 3 files changed, 119 insertions(+), 2 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala index cde23938ef..b9e277afd9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala @@ -202,13 +202,30 @@ object FeatureRDD { * @return Returns the feature as a single line BED string. */ private[rdd] def toBed(feature: Feature): String = { + toBed(feature, None, None, None) + } + + /** + * @param feature Feature to write in BED format. + * @return Returns the feature as a single line BED string. + */ + private[rdd] def toBed(feature: Feature, + minimumScore: Option[Double], + maximumScore: Option[Double], + missingValue: Option[Int]): String = { + val chrom = feature.getContigName val start = feature.getStart val end = feature.getEnd val name = Features.nameOf(feature) - val score = Option(feature.getScore).getOrElse(".") val strand = Features.asString(feature.getStrand) + val score = if (minimumScore.isDefined && maximumScore.isDefined && missingValue.isDefined) { + Features.interpolateScore(feature.getScore, minimumScore.get, maximumScore.get, missingValue.get) + } else { + Features.formatScore(feature.getScore) + } + if (!feature.getAttributes.containsKey("thickStart") && !feature.getAttributes.containsKey("itemRgb") && !feature.getAttributes.containsKey("blockCount")) { @@ -492,7 +509,35 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } /** - * Save this FeatureRDD in BED format. + * Save this FeatureRDD in UCSC BED format, where score is formatted as + * integer values between 0 and 1000, with missing value as specified. + * + * @param fileName The path to save BED formatted text file(s) to. + * @param asSingleFile By default (false), writes file to disk as shards with + * one shard per partition. If true, we save the file to disk as a single + * file by merging the shards. + * @param disableFastConcat If asSingleFile is true, disables the use of the + * parallel file merging engine. + * @param minimumScore Minimum score, interpolated to 0. + * @param maximumScore Maximum score, interpolated to 1000. + * @param missingValue Value to use if score is not specified. Defaults to 0. + */ + def saveAsUcscBed(fileName: String, + asSingleFile: Boolean = false, + disableFastConcat: Boolean = false, + minimumScore: Double, + maximumScore: Double, + missingValue: Int = 0) = { + + writeTextRdd(rdd.map(FeatureRDD.toBed(_, Some(minimumScore), Some(maximumScore), Some(missingValue))), + fileName, + asSingleFile, + disableFastConcat) + } + + /** + * Save this FeatureRDD in bedtools2 BED format, where score is formatted + * as double floating point values with missing values. * * @param fileName The path to save BED formatted text file(s) to. * @param asSingleFile By default (false), writes file to disk as shards with @@ -504,6 +549,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd def saveAsBed(fileName: String, asSingleFile: Boolean = false, disableFastConcat: Boolean = false) = { + writeTextRdd(rdd.map(FeatureRDD.toBed), fileName, asSingleFile, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala index 9b18848040..f46acc5589 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/Features.scala @@ -225,4 +225,55 @@ private[feature] object Features { } "sequence_feature" } + + /** + * Format the feature score as double floating point values + * with "." as missing value. + * + * @param score Feature score to format. + * @return Return the specified feature score formatted as + * double floating point values with "." as missing value. + */ + def formatScore(score: java.lang.Double): String = { + Option(score).fold(".")(_.toString) + } + + /** + * Interpolate the feature score to integer values between 0 and 1000, + * with missing value as specified. + * + * @param score Feature score to interpolate. + * @param minimumScore Minimum score, interpolated to 0. + * @param maximumScore Maximum score, interpolated to 1000. + * @param missingValue Value to use if score is not specified. + * @return Return the specified feature score interpolated to integer values + * between 0 and 1000, with missing value as specified. + */ + def interpolateScore(score: java.lang.Double, + minimumScore: Double, + maximumScore: Double, + missingValue: Int): Int = { + + def constrain(v: Double, min: Double, max: Double): Double = { + if (v < min) { + min + } else if (v > max) { + max + } else { + v + } + } + + def interp(value: Double, + sourceMin: Double, + sourceMax: Double, + targetMin: Double, + targetMax: Double): Double = { + + val v = constrain(value, sourceMin, sourceMax) + targetMin + (targetMax - targetMin) * ((v - sourceMin) / (sourceMax - sourceMin)) + } + + Option(score).fold(missingValue)(interp(_, minimumScore, maximumScore, 0, 1000).toInt) + } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala index 1950737fbc..f77c34c615 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala @@ -343,6 +343,26 @@ class FeatureRDDSuite extends ADAMFunSuite { checkFiles(inputPath, outputPath) } + sparkTest("save to UCSC BED format") { + val inputPath = testFile("dvl1.200.bed") + val expected = sc.loadBed(inputPath) + val outputPath = tempLocation(".bed") + expected.saveAsUcscBed(outputPath, + asSingleFile = true, + minimumScore = 0.0, + maximumScore = 200.0) + + val lines = sc.textFile(outputPath) + val bedCols = lines.first.split("\t") + assert(bedCols.size === 6) + assert(bedCols(0) === "1") + assert(bedCols(1) === "1331345") + assert(bedCols(2) === "1331536") + assert(bedCols(3) === "106624") + assert(bedCols(4) === "67") + assert(bedCols(5) === "+") + } + sparkTest("save IntervalList as GTF format") { val inputPath = testFile("SeqCap_EZ_Exome_v3.hg19.interval_list") val features = sc.loadIntervalList(inputPath)