Skip to content

Commit

Permalink
[ADAM-882] Add R API.
Browse files Browse the repository at this point in the history
Resolves bigdatagenomics#882. Adds an R API that binds around the ADAM GenomicRDD APIs.
Supports Spark 2.x and onwards, as accessible SparkR functionality for binding
to Java libraries was added in Spark 2.0.0.
  • Loading branch information
fnothaft committed Jul 21, 2017
1 parent 7652536 commit d3975ba
Show file tree
Hide file tree
Showing 20 changed files with 1,539 additions and 24 deletions.
6 changes: 3 additions & 3 deletions adam-python/src/bdgenomics/adam/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,13 @@ def saveAsSam(self,

if asType is None:

type = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.inferFromFilePath(filePath)
fileType = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.inferFromFilePath(filePath)

else:

type = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.valueOf(asType)
fileType = self.sc._jvm.org.seqdoop.hadoop_bam.SAMFormat.valueOf(asType)

self._jvmRdd.saveAsSam(filePath, type, asSingleFile, isSorted)
self._jvmRdd.saveAsSam(filePath, fileType, asSingleFile, isSorted)


def saveAsSamString(self):
Expand Down
3 changes: 3 additions & 0 deletions adam-r/bdg.adam/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
25 changes: 25 additions & 0 deletions adam-r/bdg.adam/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Package: bdg.adam
Type: Package
Version: 0.23.0
Title: R Frontend for Big Data Genomics/ADAM
Description: The bdg.adam package provides an R frontend for the Big Data Genomics ADAM APIs on Apache Spark.
Author: Big Data Genomics
Maintainer: Frank Austin Nothaft <fnothaft@alumni.stanford.edu>
Authors@R: c(person("Frank", "Nothaft", role = c("aut", "cre"),
email = "fnothaft@alumni.stanford.edu"),
person(family = "Big Data Genomics", role = c("aut", "cph")))
License: Apache License (== 2.0)
URL: http://www.bdgenomics.org https://github.com/bigdatagenomics/adam
BugReports: https://github.com/bigdatagenomics/adam/issues
Imports:
methods
Depends:
R (>= 3.0),
SparkR (>= 2.1.0)
Suggests:
testthat
Collate:
'generics.R'
'adam-context.R'
'rdd.R'
RoxygenNote: 6.0.1
33 changes: 33 additions & 0 deletions adam-r/bdg.adam/NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by roxygen2: do not edit by hand

export(ADAMContext)
exportClasses(ADAMContext)
exportClasses(AlignmentRecordRDD)
exportClasses(CoverageRDD)
exportClasses(FeatureRDD)
exportClasses(FragmentRDD)
exportClasses(GenotypeRDD)
exportClasses(NucleotideContigFragmentRDD)
exportClasses(VariantRDD)
exportMethods(aggregatedCoverage)
exportMethods(countKmers)
exportMethods(coverage)
exportMethods(flankAdjacentFragments)
exportMethods(flatten)
exportMethods(loadAlignments)
exportMethods(loadContigFragments)
exportMethods(loadFeatures)
exportMethods(loadFragments)
exportMethods(loadGenotypes)
exportMethods(loadVariants)
exportMethods(markDuplicates)
exportMethods(realignIndels)
exportMethods(recalibrateBaseQualities)
exportMethods(save)
exportMethods(sortReadsByReferencePosition)
exportMethods(sortReadsByReferencePositionAndIndex)
exportMethods(toCoverage)
exportMethods(toDF)
exportMethods(toFeatureRDD)
exportMethods(toFragments)
exportMethods(toReads)
149 changes: 149 additions & 0 deletions adam-r/bdg.adam/R/adam-context.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
library(SparkR)

# Register "jobj" as an S3 class so that it can be used as the type of an
# S4 slot below.
setOldClass("jobj")

#' @title Class that represents an ADAMContext.
#' @description An ADAMContext offers helper methods that load genomic data
#' into a Spark RDD/Dataframe.
#' @slot jac Java object reference to the backing JavaADAMContext.
#' @export
setClass(
  "ADAMContext",
  slots = c(jac = "jobj")
)

#' Creates an ADAMContext from a Spark session.
#'
#' Retrieves the SparkContext from the given session, builds the JVM-side
#' ADAMContext around it, and wraps that in a JavaADAMContext.
#'
#' @param ss The Spark session. `sparkContext` is invoked on it, so it is
#'   expected to be a Java object reference to a SparkSession.
#' @return An ADAMContext whose `jac` slot references the newly created
#'   JavaADAMContext.
#'
#' @export
ADAMContext <- function(ss) {
  ssc <- sparkR.callJMethod(ss, "sparkContext")
  ac <- sparkR.newJObject("org.bdgenomics.adam.rdd.ADAMContext", ssc)
  jac <- sparkR.newJObject("org.bdgenomics.adam.api.java.JavaADAMContext", ac)

  new("ADAMContext", jac = jac)
}

#' Loads in an ADAM read file. SAM, BAM, and ADAM files can all be loaded.
#'
#' The format is selected from the path name. Paths ending in:
#' * .bam/.cram/.sam are loaded as BAM/CRAM/SAM format,
#' * .fa/.fasta are loaded as FASTA format,
#' * .fq/.fastq are loaded as FASTQ format, and
#' * .ifq are loaded as interleaved FASTQ format.
#'
#' Any path that matches none of these falls back to Parquet + Avro.
#'
#' Compressed FASTA, FASTQ, and interleaved FASTQ files are supported through
#' the compression codecs configured in Hadoop. By default these include .gz
#' and .bz2, but more can be configured.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns an RDD containing reads.
#'
#' @export
setMethod(
  "loadAlignments",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    AlignmentRecordRDD(
      sparkR.callJMethod(ac@jac, "loadAlignments", filePath)
    )
  }
)

#' Loads in sequence fragments.
#'
#' The input may be FASTA, or Parquet encoded NucleotideContigFragments.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns an RDD containing sequence fragments.
#'
#' @export
setMethod(
  "loadContigFragments",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    NucleotideContigFragmentRDD(
      sparkR.callJMethod(ac@jac, "loadContigFragments", filePath)
    )
  }
)

#' Loads in read pairs as fragments.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns a FragmentRDD wrapping the loaded fragments.
#'
#' @export
setMethod(
  "loadFragments",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    FragmentRDD(
      sparkR.callJMethod(ac@jac, "loadFragments", filePath)
    )
  }
)

#' Loads in genomic features.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns an RDD containing features.
#'
#' @export
setMethod(
  "loadFeatures",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    FeatureRDD(
      sparkR.callJMethod(ac@jac, "loadFeatures", filePath)
    )
  }
)

#' Loads in genomic features as coverage counts.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns an RDD containing coverage.
#'
#' @export
setMethod(
  "loadCoverage",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    CoverageRDD(
      sparkR.callJMethod(ac@jac, "loadCoverage", filePath)
    )
  }
)

#' Loads in genotypes.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns an RDD containing genotypes.
#'
#' @export
setMethod(
  "loadGenotypes",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    GenotypeRDD(
      sparkR.callJMethod(ac@jac, "loadGenotypes", filePath)
    )
  }
)

#' Loads in variants.
#'
#' @param ac The ADAMContext.
#' @param filePath The path to load the file from.
#' @return Returns an RDD containing variants.
#'
#' @export
setMethod(
  "loadVariants",
  signature(ac = "ADAMContext", filePath = "character"),
  function(ac, filePath) {
    # Delegate to the backing JavaADAMContext, then wrap the returned JVM RDD.
    VariantRDD(
      sparkR.callJMethod(ac@jac, "loadVariants", filePath)
    )
  }
)
Loading

0 comments on commit d3975ba

Please sign in to comment.