
[ADAM-1168] Write to .gff3 file path as GFF3
heuermh committed Sep 13, 2016
1 parent c6bce89 commit d447081
Showing 5 changed files with 32 additions and 49 deletions.
TransformFeatures.scala
@@ -33,7 +33,7 @@ object TransformFeatures extends BDGCommandCompanion {

 class TransformFeaturesArgs extends Args4jBase with ParquetSaveArgs {
   @Argument(required = true, metaVar = "INPUT",
-    usage = "The features file to convert (e.g., .bed, .gff). If extension is not detected, Parquet is assumed.", index = 0)
+    usage = "The features file to convert (e.g., .bed, .gff/.gtf, .gff3, .interval_list, .narrowPeak). If extension is not detected, Parquet is assumed.", index = 0)
   var featuresFile: String = _
 
   @Argument(required = true, metaVar = "OUTPUT",
ADAMContext.scala
@@ -1205,6 +1205,7 @@ class ADAMContext private (@transient val sc: SparkContext) extends Serializable
    *
    * @see loadBed
    * @see loadGtf
+   * @see loadGff3
    * @see loadNarrowPeak
    * @see loadIntervalList
    * @see loadParquetFeatures
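A short usage sketch for the loader this new cross-reference points to. It assumes an active SparkContext named sc with the ADAMContext implicits in scope, and reuses the dvl1.200.gff3 resource from the test suite below:

    import org.bdgenomics.adam.rdd.ADAMContext._

    // Assumes sc: SparkContext is already constructed.
    // loadGff3 parses GFF3 records into a FeatureRDD.
    val features = sc.loadGff3("dvl1.200.gff3")
    println(features.rdd.count)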
FeatureRDD.scala
@@ -210,11 +210,10 @@ case class FeatureRDD(rdd: RDD[Feature],
   /**
    * Java friendly save function. Automatically detects the output format.
    *
-   * If the filename ends in ".bed", we write a BED file. If the file name ends
-   * in ".gtf" or ".gff", we write the file as GTF/GFF2. If the file name ends
-   * in ".narrow[pP]eak", we save in the NarrowPeak format. If the file name
-   * ends in ".interval_list", we save in the interval list format. Else, we
-   * save as Parquet. These files are written as sharded text files.
+   * Writes files ending in .bed as BED6/12, .gff3 as GFF3, .gtf/.gff as
+   * GTF/GFF2, .narrow[pP]eak as NarrowPeak, and .interval_list as
+   * IntervalList. If none of these match, we fall back to Parquet.
+   * These files are written as sharded text files.
    *
    * @param filePath The location to write the output.
    * @param asSingleFile If false, writes file to disk as shards with
@@ -227,6 +226,8 @@ case class FeatureRDD(rdd: RDD[Feature],
     } else if (filePath.endsWith(".gtf") ||
       filePath.endsWith(".gff")) {
       saveAsGtf(filePath, asSingleFile = asSingleFile)
+    } else if (filePath.endsWith(".gff3")) {
+      saveAsGff3(filePath, asSingleFile = asSingleFile)
     } else if (filePath.endsWith(".narrowPeak") ||
       filePath.endsWith(".narrowpeak")) {
       saveAsNarrowPeak(filePath, asSingleFile = asSingleFile)
@@ -340,7 +341,6 @@ case class FeatureRDD(rdd: RDD[Feature],
    * file by merging the shards.
    */
   def saveAsIntervalList(fileName: String, asSingleFile: Boolean = false) = {
-    // todo: SAM style header
     val intervalEntities = rdd.map(FeatureRDD.toInterval)
 
     if (asSingleFile) {
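A usage sketch of the extension dispatch in save() above. The output paths are hypothetical, and an active SparkContext named sc with the ADAMContext implicits is assumed:

    import org.bdgenomics.adam.rdd.ADAMContext._

    // Assumes sc: SparkContext. Each call below routes through save(),
    // which picks a writer by file extension.
    val features = sc.loadGff3("dvl1.200.gff3")
    features.save("out.gff3", true)  // new in this commit: written as GFF3
    features.save("out.gtf", true)   // written as GTF/GFF2
    features.save("out.adam", false) // no extension match: falls back to Parquet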
ADAMContextSuite.scala
@@ -138,33 +138,10 @@ class ADAMContextSuite extends ADAMFunSuite {
     val arr = annot.collect
 
     val first = arr.find(f => f.getContigName == "chr1" && f.getStart == 14415L && f.getEnd == 14499L).get
-    assert(
-      first
-        .getDbxrefs
-        .map(dbxref => dbxref.getDb -> dbxref.getAccession)
-        .groupBy(_._1)
-        .mapValues(_.map(_._2).toSet) ==
-        Map(
-          "gn" -> Set("DDX11L1", "RP11-34P13.2"),
-          "ens" -> Set("ENSG00000223972", "ENSG00000227232"),
-          "vega" -> Set("OTTHUMG00000000958", "OTTHUMG00000000961")
-        )
-    )
+    assert(first.getName === "gn|DDX11L1;gn|RP11-34P13.2;ens|ENSG00000223972;ens|ENSG00000227232;vega|OTTHUMG00000000958;vega|OTTHUMG00000000961")
 
     val last = arr.find(f => f.getContigName == "chrY" && f.getStart == 27190031L && f.getEnd == 27190210L).get
-    assert(
-      last
-        .getDbxrefs
-        .map(dbxref => dbxref.getDb -> dbxref.getAccession)
-        .groupBy(_._1)
-        .mapValues(_.map(_._2).toSet) ==
-        Map(
-          "gn" -> Set("BPY2C"),
-          "ccds" -> Set("CCDS44030"),
-          "ens" -> Set("ENSG00000185894"),
-          "vega" -> Set("OTTHUMG00000045199")
-        )
-    )
+    assert(last.getName === "gn|BPY2C;ccds|CCDS44030;ens|ENSG00000185894;vega|OTTHUMG00000045199")
   }
 
   sparkTest("can read a small .vcf file") {
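For reference, the old assertions grouped getDbxrefs into a Map keyed by database, while the new ones compare getName directly against a flattened encoding: "db|accession" pairs joined by semicolons. A minimal sketch, reusing the value from the first assertion, of how that string decomposes back into (db, accession) pairs; the parsing here is illustrative, not an ADAM API:

    // Name value copied from the assertion above.
    val name = "gn|DDX11L1;gn|RP11-34P13.2;ens|ENSG00000223972;ens|ENSG00000227232;vega|OTTHUMG00000000958;vega|OTTHUMG00000000961"
    val dbxrefs = name.split(";").map { pair =>
      val Array(db, accession) = pair.split("\\|")
      db -> accession
    }
    // dbxrefs: Array((gn,DDX11L1), (gn,RP11-34P13.2), (ens,ENSG00000223972), ...)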
FeatureRDDSuite.scala
@@ -60,7 +60,7 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     new File(tempDir, tempFile.getName + suffix).getAbsolutePath
   }
 
-  sparkTest("save GTF as GTF format") {
+  sparkTest("round trip GTF format") {
     val inputPath = resourcePath("Homo_sapiens.GRCh37.75.trun100.gtf")
     val features = sc.loadGtf(inputPath)
 
@@ -131,6 +131,15 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     assert(features.rdd.count === reloadedFeatures.rdd.count)
   }
 
+  sparkTest("save GFF3 as GTF format") {
+    val inputPath = resourcePath("dvl1.200.gff3")
+    val features = sc.loadGff3(inputPath)
+    val outputPath = tempLocation(".gtf")
+    features.saveAsGtf(outputPath)
+    val reloadedFeatures = sc.loadGtf(outputPath)
+    assert(features.rdd.count === reloadedFeatures.rdd.count)
+  }
+
   sparkTest("save GFF3 as BED format") {
     val inputPath = resourcePath("dvl1.200.gff3")
     val features = sc.loadGff3(inputPath)
@@ -183,9 +192,9 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {

     val actual = sc.loadGff3(outputPath)
     val pairs = expected.rdd.collect.zip(actual.rdd.collect)
-
-    // separate foreach since assert is not serializable
-    pairs.foreach({ pair: (Feature, Feature) => assert(pair._1 === pair._2) })
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
+    })
   }
 
   sparkTest("save BED as GTF format") {
@@ -240,12 +249,11 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     assert(bedCols(4) === "13.53")
     assert(bedCols(5) === "+")
 
-    // grab all partitions, may not necessarily be in order; sort by reference
     val actual = sc.loadBed(outputPath)
     val pairs = expected.rdd.collect.zip(actual.rdd.collect)
-
-    // separate since assert is not serializable
-    pairs.foreach({ pair: (Feature, Feature) => assert(pair._1 === pair._2) })
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
+    })
   }
 
   sparkTest("save IntervalList as GTF format") {
@@ -322,12 +330,11 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
     val outputPath = tempLocation(".interval_list")
     expected.saveAsIntervalList(outputPath, asSingleFile = true)
 
-    // grab all partitions, may not necessarily be in order; sort by reference
     val actual = sc.loadIntervalList(outputPath)
-    val pairs = expected.rdd.zip(actual.rdd).collect
-
-    // separate foreach since assert is not serializable
-    pairs.foreach({ pair: (Feature, Feature) => assert(pair._1 === pair._2) })
+    val pairs = expected.rdd.collect.zip(actual.rdd.collect)
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
+    })
   }
 
   sparkTest("save NarrowPeak as GTF format") {
@@ -395,10 +402,8 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {

     val actual = sc.loadNarrowPeak(outputPath)
     val pairs = expected.rdd.zip(actual.rdd).collect
-
-    // separate foreach since assert is not serializable
-    pairs.foreach(pair => {
-      assert(pair._1 === pair._2)
+    pairs.foreach(p => {
+      assert(p._1 === p._2)
     })
   }
 
