
[ADAM-1168] Write to .gff3 file path as GFF3
heuermh committed Sep 13, 2016
1 parent c6bce89 commit d447081
Showing 5 changed files with 32 additions and 49 deletions.
TransformFeatures.scala
@@ -33,7 +33,7 @@ object TransformFeatures extends BDGCommandCompanion {

 class TransformFeaturesArgs extends Args4jBase with ParquetSaveArgs {
   @Argument(required = true, metaVar = "INPUT",
-    usage = "The features file to convert (e.g., .bed, .gff). If extension is not detected, Parquet is assumed.", index = 0)
+    usage = "The features file to convert (e.g., .bed, .gff/.gtf, .gff3, .interval_list, .narrowPeak). If extension is not detected, Parquet is assumed.", index = 0)
   var featuresFile: String = _
 
   @Argument(required = true, metaVar = "OUTPUT",
ADAMContext.scala
@@ -1205,6 +1205,7 @@ class ADAMContext private (@transient val sc: SparkContext) extends Serializable
    *
    * @see loadBed
    * @see loadGtf
+   * @see loadGff3
    * @see loadNarrowPeak
    * @see loadIntervalList
    * @see loadParquetFeatures
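A short usage sketch for the loader this new cross-reference points to. It assumes an active SparkContext named sc with the ADAMContext implicits in scope, and reuses the dvl1.200.gff3 resource from the test suite below:

    import org.bdgenomics.adam.rdd.ADAMContext._

    // Assumes sc: SparkContext is already constructed.
    // loadGff3 parses GFF3 records into a FeatureRDD.
    val features = sc.loadGff3("dvl1.200.gff3")
    println(features.rdd.count)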
FeatureRDD.scala
@@ -210,11 +210,10 @@ case class FeatureRDD(rdd: RDD[Feature],
   /**
    * Java friendly save function. Automatically detects the output format.
    *
-   * If the filename ends in ".bed", we write a BED file. If the file name ends
-   * in ".gtf" or ".gff", we write the file as GTF/GFF2. If the file name ends
-   * in ".narrow[pP]eak", we save in the NarrowPeak format. If the file name
-   * ends in ".interval_list", we save in the interval list format. Else, we
-   * save as Parquet. These files are written as sharded text files.
+   * Writes files ending in .bed as BED6/12, .gff3 as GFF3, .gtf/.gff as
+   * GTF/GFF2, .narrow[pP]eak as NarrowPeak, and .interval_list as
+   * IntervalList. If none of these match, we fall back to Parquet.
+   * These files are written as sharded text files.
    *
    * @param filePath The location to write the output.
    * @param asSingleFile If false, writes file to disk as shards with
@@ -227,6 +226,8 @@ case class FeatureRDD(rdd: RDD[Feature],
     } else if (filePath.endsWith(".gtf") ||
       filePath.endsWith(".gff")) {
       saveAsGtf(filePath, asSingleFile = asSingleFile)
+    } else if (filePath.endsWith(".gff3")) {
+      saveAsGff3(filePath, asSingleFile = asSingleFile)
     } else if (filePath.endsWith(".narrowPeak") ||
       filePath.endsWith(".narrowpeak")) {
       saveAsNarrowPeak(filePath, asSingleFile = asSingleFile)
@@ -340,7 +341,6 @@ case class FeatureRDD(rdd: RDD[Feature],
    * file by merging the shards.
    */
   def saveAsIntervalList(fileName: String, asSingleFile: Boolean = false) = {
-    // todo: SAM style header
     val intervalEntities = rdd.map(FeatureRDD.toInterval)
 
     if (asSingleFile) {
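A usage sketch of the extension dispatch in save() above. The output paths are hypothetical, and an active SparkContext named sc with the ADAMContext implicits is assumed:

    import org.bdgenomics.adam.rdd.ADAMContext._

    // Assumes sc: SparkContext. Each call below routes through save(),
    // which picks a writer by file extension.
    val features = sc.loadGff3("dvl1.200.gff3")
    features.save("out.gff3", true)  // new in this commit: written as GFF3
    features.save("out.gtf", true)   // written as GTF/GFF2
    features.save("out.adam", false) // no extension match: falls back to Parquet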
ADAMContextSuite.scala
@@ -138,33 +138,10 @@ class ADAMContextSuite extends ADAMFunSuite {
     val arr = annot.collect
 
     val first = arr.find(f => f.getContigName == "chr1" && f.getStart == 14415L && f.getEnd == 14499L).get
-    assert(
-      first
-        .getDbxrefs
-        .map(dbxref => dbxref.getDb -> dbxref.getAccession)
-        .groupBy(_._1)
-        .mapValues(_.map(_._2).toSet) ==
-        Map(
-          "gn" -> Set("DDX11L1", "RP11-34P13.2"),
-          "ens" -> Set("ENSG00000223972", "ENSG00000227232"),
-          "vega" -> Set("OTTHUMG00000000958", "OTTHUMG00000000961")
-        )
-    )
+    assert(first.getName === "gn|DDX11L1;gn|RP11-34P13.2;ens|ENSG00000223972;ens|ENSG00000227232;vega|OTTHUMG00000000958;vega|OTTHUMG00000000961")
 
     val last = arr.find(f => f.getContigName == "chrY" && f.getStart == 27190031L && f.getEnd == 27190210L).get
-    assert(
-      last
-        .getDbxrefs
-        .map(dbxref => dbxref.getDb -> dbxref.getAccession)
-        .groupBy(_._1)
-        .mapValues(_.map(_._2).toSet) ==
-        Map(
-          "gn" -> Set("BPY2C"),
-          "ccds" -> Set("CCDS44030"),
-          "ens" -> Set("ENSG00000185894"),
-          "vega" -> Set("OTTHUMG00000045199")
-        )
-    )
+    assert(last.getName === "gn|BPY2C;ccds|CCDS44030;ens|ENSG00000185894;vega|OTTHUMG00000045199")
   }
 
   sparkTest("can read a small .vcf file") {
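For reference, the old assertions grouped getDbxrefs into a Map keyed by database, while the new ones compare getName directly against a flattened encoding: "db|accession" pairs joined by semicolons. A minimal sketch, reusing the value from the first assertion, of how that string decomposes back into (db, accession) pairs; the parsing here is illustrative, not an ADAM API:

    // Name value copied from the assertion above.
    val name = "gn|DDX11L1;gn|RP11-34P13.2;ens|ENSG00000223972;ens|ENSG00000227232;vega|OTTHUMG00000000958;vega|OTTHUMG00000000961"
    val dbxrefs = name.split(";").map { pair =>
      val Array(db, accession) = pair.split("\\|")
      db -> accession
    }
    // dbxrefs: Array((gn,DDX11L1), (gn,RP11-34P13.2), (ens,ENSG00000223972), ...)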
FeatureRDDSuite.scala
@@ -60,7 +60,7 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     new File(tempDir, tempFile.getName + suffix).getAbsolutePath
   }
 
-  sparkTest("save GTF as GTF format") {
+  sparkTest("round trip GTF format") {
     val inputPath = resourcePath("Homo_sapiens.GRCh37.75.trun100.gtf")
     val features = sc.loadGtf(inputPath)
 
@@ -131,6 +131,15 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     assert(features.rdd.count === reloadedFeatures.rdd.count)
   }
 
+  sparkTest("save GFF3 as GTF format") {
+    val inputPath = resourcePath("dvl1.200.gff3")
+    val features = sc.loadGff3(inputPath)
+    val outputPath = tempLocation(".gtf")
+    features.saveAsGtf(outputPath)
+    val reloadedFeatures = sc.loadGtf(outputPath)
+    assert(features.rdd.count === reloadedFeatures.rdd.count)
+  }
+
   sparkTest("save GFF3 as BED format") {
     val inputPath = resourcePath("dvl1.200.gff3")
     val features = sc.loadGff3(inputPath)
@@ -183,9 +192,9 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {

     val actual = sc.loadGff3(outputPath)
     val pairs = expected.rdd.collect.zip(actual.rdd.collect)
-
-    // separate foreach since assert is not serializable
-    pairs.foreach({ pair: (Feature, Feature) => assert(pair._1 === pair._2) })
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
+    })
   }
 
   sparkTest("save BED as GTF format") {
@@ -240,12 +249,11 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     assert(bedCols(4) === "13.53")
     assert(bedCols(5) === "+")
 
-    // grab all partitions, may not necessarily be in order; sort by reference
     val actual = sc.loadBed(outputPath)
     val pairs = expected.rdd.collect.zip(actual.rdd.collect)
-
-    // separate since assert is not serializable
-    pairs.foreach({ pair: (Feature, Feature) => assert(pair._1 === pair._2) })
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
+    })
   }
 
   sparkTest("save IntervalList as GTF format") {
@@ -322,12 +330,11 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     val outputPath = tempLocation(".interval_list")
     expected.saveAsIntervalList(outputPath, asSingleFile = true)
 
-    // grab all partitions, may not necessarily be in order; sort by reference
     val actual = sc.loadIntervalList(outputPath)
-    val pairs = expected.rdd.zip(actual.rdd).collect
-
-    // separate foreach since assert is not serializable
-    pairs.foreach({ pair: (Feature, Feature) => assert(pair._1 === pair._2) })
+    val pairs = expected.rdd.collect.zip(actual.rdd.collect)
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
+    })
   }
 
   sparkTest("save NarrowPeak as GTF format") {
@@ -395,10 +402,8 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {

     val actual = sc.loadNarrowPeak(outputPath)
     val pairs = expected.rdd.zip(actual.rdd).collect
-
-    // separate foreach since assert is not serializable
-    pairs.foreach(pair => {
-      assert(pair._1 === pair._2)
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
     })
   }
 
