Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-2039] Adding support for writing BED format per UCSC definition #2042

Merged
merged 2 commits into from
Oct 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,30 @@ object FeatureRDD {
* @return Returns the feature as a single line BED string.
*/
private[rdd] def toBed(feature: Feature): String = {
toBed(feature, None, None, None)
}

/**
* @param feature Feature to write in BED format.
* @return Returns the feature as a single line BED string.
*/
private[rdd] def toBed(feature: Feature,
minimumScore: Option[Double],
maximumScore: Option[Double],
missingValue: Option[Int]): String = {

val chrom = feature.getContigName
val start = feature.getStart
val end = feature.getEnd
val name = Features.nameOf(feature)
val score = Option(feature.getScore).getOrElse(".")
val strand = Features.asString(feature.getStrand)

val score = if (minimumScore.isDefined && maximumScore.isDefined && missingValue.isDefined) {
Features.interpolateScore(feature.getScore, minimumScore.get, maximumScore.get, missingValue.get)
} else {
Features.formatScore(feature.getScore)
}

if (!feature.getAttributes.containsKey("thickStart") &&
!feature.getAttributes.containsKey("itemRgb") &&
!feature.getAttributes.containsKey("blockCount")) {
Expand Down Expand Up @@ -492,7 +509,35 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd
}

/**
* Save this FeatureRDD in BED format.
* Save this FeatureRDD in UCSC BED format, where score is formatted as
* integer values between 0 and 1000, with missing value as specified.
*
* @param fileName The path to save BED formatted text file(s) to.
* @param asSingleFile By default (false), writes file to disk as shards with
* one shard per partition. If true, we save the file to disk as a single
* file by merging the shards.
* @param disableFastConcat If asSingleFile is true, disables the use of the
* parallel file merging engine.
* @param minimumScore Minimum score, interpolated to 0.
* @param maximumScore Maximum score, interpolated to 1000.
* @param missingValue Value to use if score is not specified. Defaults to 0.
*/
def saveAsUcscBed(fileName: String,
asSingleFile: Boolean = false,
disableFastConcat: Boolean = false,
minimumScore: Double,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If these are "interpolated to 0/1000", shouldn't 0.0/1000.0 be the default values?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minimumScore is the score which interpolates to 0. maximumScore is the score which interpolates to 1000. Output in this convention is always integers between 0 and 1000, inclusive.

maximumScore: Double,
missingValue: Int = 0) = {

writeTextRdd(rdd.map(FeatureRDD.toBed(_, Some(minimumScore), Some(maximumScore), Some(missingValue))),
fileName,
asSingleFile,
disableFastConcat)
}

/**
* Save this FeatureRDD in bedtools2 BED format, where score is formatted
* as double floating point values with missing values.
*
* @param fileName The path to save BED formatted text file(s) to.
* @param asSingleFile By default (false), writes file to disk as shards with
Expand All @@ -504,6 +549,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd
def saveAsBed(fileName: String,
asSingleFile: Boolean = false,
disableFastConcat: Boolean = false) = {

writeTextRdd(rdd.map(FeatureRDD.toBed),
fileName,
asSingleFile,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.bdgenomics.adam.rdd.feature
import org.bdgenomics.formats.avro.{ Dbxref, Feature, OntologyTerm, Strand }
import scala.collection.JavaConversions._
import scala.collection.mutable.{ ArrayBuffer, HashMap, MutableList }
import scala.math.{ max, min }

/**
* Utility methods on features and related classes.
Expand Down Expand Up @@ -225,4 +226,55 @@ private[feature] object Features {
}
"sequence_feature"
}

/**
* Format the feature score as double floating point values
* with "." as missing value.
*
* @param score Feature score to format.
* @return Return the specified feature score formatted as
* double floating point values with "." as missing value.
*/
def formatScore(score: java.lang.Double): String = {
Option(score).fold(".")(_.toString)
}

/**
* Interpolate the feature score to integer values between 0 and 1000,
* with missing value as specified.
*
* @param score Feature score to interpolate.
* @param minimumScore Minimum score, interpolated to 0.
* @param maximumScore Maximum score, interpolated to 1000.
* @param missingValue Value to use if score is not specified.
* @return Return the specified feature score interpolated to integer values
* between 0 and 1000, with missing value as specified.
*/
def interpolateScore(score: java.lang.Double,
minimumScore: Double,
maximumScore: Double,
missingValue: Int): Int = {

def constrain(v: Double, min: Double, max: Double): Double = {
if (v < min) {
min
} else if (v > max) {
max
} else {
v
}
}

def interp(value: Double,
sourceMin: Double,
sourceMax: Double,
targetMin: Double,
targetMax: Double): Double = {

val v = max(min(sourceMax, value), sourceMin)
targetMin + (targetMax - targetMin) * ((v - sourceMin) / (sourceMax - sourceMin))
}

Option(score).fold(missingValue)(interp(_, minimumScore, maximumScore, 0, 1000).toInt)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,26 @@ class FeatureRDDSuite extends ADAMFunSuite {
checkFiles(inputPath, outputPath)
}

sparkTest("save to UCSC BED format") {
val inputPath = testFile("dvl1.200.bed")
val expected = sc.loadBed(inputPath)
val outputPath = tempLocation(".bed")
expected.saveAsUcscBed(outputPath,
asSingleFile = true,
minimumScore = 0.0,
maximumScore = 200.0)

val lines = sc.textFile(outputPath)
val bedCols = lines.first.split("\t")
assert(bedCols.size === 6)
assert(bedCols(0) === "1")
assert(bedCols(1) === "1331345")
assert(bedCols(2) === "1331536")
assert(bedCols(3) === "106624")
assert(bedCols(4) === "67")
assert(bedCols(5) === "+")
}

sparkTest("save IntervalList as GTF format") {
val inputPath = testFile("SeqCap_EZ_Exome_v3.hg19.interval_list")
val features = sc.loadIntervalList(inputPath)
Expand Down