[ADAM-882] Add R API.

Resolves bigdatagenomics#882. Adds an R API that binds around the ADAM GenomicRDD APIs. Supports Spark 2.x and onwards, as accessible SparkR functionality for binding to Java libraries was added in Spark 2.0.0.
fnothaft · Jul 21, 2017 · d3975ba · d3975ba
1 parent 7652536
commit d3975ba
Show file tree

Hide file tree

Showing 20 changed files with 1,539 additions and 24 deletions.
diff --git a/adam-python/src/bdgenomics/adam/rdd.py b/adam-python/src/bdgenomics/adam/rdd.py
@@ -158,13 +158,13 @@ def saveAsSam(self,
 
         if asType is None:
 
-            type = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.inferFromFilePath(filePath)
+            fileType = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.inferFromFilePath(filePath)
 
         else:
 
-            type = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.valueOf(asType)
+            fileType = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.valueOf(asType)
 
-        self._jvmRdd.saveAsSam(filePath, type, asSingleFile, isSorted)
+        self._jvmRdd.saveAsSam(filePath, fileType, asSingleFile, isSorted)
 
 
     def saveAsSamString(self):

diff --git a/adam-r/bdg.adam/.gitignore b/adam-r/bdg.adam/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/adam-r/bdg.adam/DESCRIPTION b/adam-r/bdg.adam/DESCRIPTION
@@ -0,0 +1,25 @@
+Package: bdg.adam
+Type: Package
+Version: 0.23.0
+Title: R Frontend for Big Data Genomics/ADAM
+Description: Provides an R frontend for Big Data Genomics/ADAM, binding the ADAM GenomicRDD APIs through SparkR.
+Author: Big Data Genomics
+Maintainer: Frank Austin Nothaft <fnothaft@alumni.stanford.edu>
+Authors@R: c(person("Frank", "Nothaft", role = c("aut", "cre"),
+                    email = "fnothaft@alumni.stanford.edu"),
+             person(family = "Big Data Genomics", role = c("aut", "cph")))
+License: Apache License (== 2.0)
+URL: http://www.bdgenomics.org https://github.com/bigdatagenomics/adam
+BugReports: https://github.com/bigdatagenomics/adam/issues
+Imports:
+    methods
+Depends:
+    R (>= 3.0),
+    SparkR (>= 2.1.0)
+Suggests:
+    testthat
+Collate:
+    'generics.R'
+    'adam-context.R'
+    'rdd.R'
+RoxygenNote: 6.0.1
diff --git a/adam-r/bdg.adam/NAMESPACE b/adam-r/bdg.adam/NAMESPACE
@@ -0,0 +1,33 @@
+# Generated by roxygen2: do not edit by hand
+
+export(ADAMContext)
+exportClasses(ADAMContext)
+exportClasses(AlignmentRecordRDD)
+exportClasses(CoverageRDD)
+exportClasses(FeatureRDD)
+exportClasses(FragmentRDD)
+exportClasses(GenotypeRDD)
+exportClasses(NucleotideContigFragmentRDD)
+exportClasses(VariantRDD)
+exportMethods(aggregatedCoverage)
+exportMethods(countKmers)
+exportMethods(coverage)
+exportMethods(flankAdjacentFragments)
+exportMethods(flatten)
+exportMethods(loadAlignments)
+exportMethods(loadContigFragments)
+exportMethods(loadFeatures)
+exportMethods(loadFragments)
+exportMethods(loadGenotypes)
+exportMethods(loadVariants)
+exportMethods(markDuplicates)
+exportMethods(realignIndels)
+exportMethods(recalibrateBaseQualities)
+exportMethods(save)
+exportMethods(sortReadsByReferencePosition)
+exportMethods(sortReadsByReferencePositionAndIndex)
+exportMethods(toCoverage)
+exportMethods(toDF)
+exportMethods(toFeatureRDD)
+exportMethods(toFragments)
+exportMethods(toReads)
diff --git a/adam-r/bdg.adam/R/adam-context.R b/adam-r/bdg.adam/R/adam-context.R
@@ -0,0 +1,149 @@
+#
+# Licensed to Big Data Genomics (BDG) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The BDG licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+library(SparkR)
+
+setOldClass("jobj")  # register the S3 class "jobj" so it can type an S4 slot
+
+#' @title S4 class wrapping a JavaADAMContext.
+#' @description An ADAMContext carries the Java object reference that the
+#'              load* methods call to read genomic data into Spark.
+#' @slot jac Java object reference ("jobj") to the backing JavaADAMContext.
+#' @export
+setClass("ADAMContext",
+         slots = c(jac = "jobj"))
+
+#' Wraps the Spark context behind `ss` (presumably a SparkSession jobj) in an ADAMContext.
+#' @export
+ADAMContext <- function(ss) {
+    sparkCtx <- sparkR.callJMethod(ss, "sparkContext")
+    adamCtx <- sparkR.newJObject("org.bdgenomics.adam.rdd.ADAMContext", sparkCtx)
+    javaCtx <- sparkR.newJObject("org.bdgenomics.adam.api.java.JavaADAMContext", adamCtx)
+    new("ADAMContext", jac = javaCtx)
+}
+
+#' Loads alignment records (reads); handles SAM, BAM, and ADAM files.
+#'
+#' Path names are dispatched on their ending:
+#' * .bam/.cram/.sam are read as BAM/CRAM/SAM format,
+#' * .fa/.fasta are read as FASTA format,
+#' * .fq/.fastq are read as FASTQ format, and
+#' * .ifq is read as interleaved FASTQ format.
+#'
+#' Any path name that matches none of these falls back to Parquet + Avro.
+#'
+#' Compressed FASTA, FASTQ, and interleaved FASTQ files are supported through
+#' the compression codecs configured in Hadoop; by default these include .gz
+#' and .bz2, but more can be configured.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns an AlignmentRecordRDD wrapping the loaded reads.
+#'
+#' @export
+setMethod("loadAlignments",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              AlignmentRecordRDD(
+                  sparkR.callJMethod(ac@jac, "loadAlignments", filePath))
+          })
+
+#' Loads nucleotide contig fragments from a file.
+#'
+#' The input may be FASTA, or Parquet encoded NucleotideContigFragments.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns a NucleotideContigFragmentRDD wrapping the loaded fragments.
+#'
+#' @export
+setMethod("loadContigFragments",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              NucleotideContigFragmentRDD(
+                  sparkR.callJMethod(ac@jac, "loadContigFragments", filePath))
+          })
+
+#' Loads read pairs, grouped as fragments, from a file.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns a FragmentRDD wrapping the loaded read pairs.
+#'
+#' @export
+setMethod("loadFragments",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              FragmentRDD(
+                  sparkR.callJMethod(ac@jac, "loadFragments", filePath))
+          })
+
+#' Loads genomic features from a file.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns a FeatureRDD wrapping the loaded features.
+#'
+#' @export
+setMethod("loadFeatures",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              FeatureRDD(
+                  sparkR.callJMethod(ac@jac, "loadFeatures", filePath))
+          })
+
+#' Loads genomic features from a file as coverage counts.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns a CoverageRDD wrapping the loaded coverage.
+#'
+#' @export
+setMethod("loadCoverage",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              # NOTE(review): NAMESPACE lacks exportMethods(loadCoverage).
+              CoverageRDD(sparkR.callJMethod(ac@jac, "loadCoverage", filePath))
+          })
+
+#' Loads genotypes from a file.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns a GenotypeRDD wrapping the loaded genotypes.
+#'
+#' @export
+setMethod("loadGenotypes",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              GenotypeRDD(
+                  sparkR.callJMethod(ac@jac, "loadGenotypes", filePath))
+          })
+
+#' Loads variants from a file.
+#'
+#' @param ac The ADAMContext whose backing JavaADAMContext performs the load.
+#' @param filePath The path to load the file from.
+#' @return Returns a VariantRDD wrapping the loaded variants.
+#'
+#' @export
+setMethod("loadVariants",
+          signature(ac = "ADAMContext", filePath = "character"),
+          function(ac, filePath) {
+              VariantRDD(
+                  sparkR.callJMethod(ac@jac, "loadVariants", filePath))
+          })