Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support for parallel by preloading data #150

Merged
merged 25 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ jobs:
fail-fast: false
matrix:
config:
- { os: ubuntu-latest, r: 'devel', bioc: '3.17', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: 'devel', bioc: '3.17'}
- { os: windows-latest, r: 'devel', bioc: '3.17'}
- { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: 'devel', bioc: '3.19'}
- { os: windows-latest, r: 'devel', bioc: '3.19'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
Expand Down
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ Authors@R: c(person(given = "Johannes", family = "Rainer",
person(given = "Laurent", family = "Gatto",
email = "lg390@cam.ac.uk",
role = "ctb",
comment = c(ORCID = "0000-0002-1520-2268")))
comment = c(ORCID = "0000-0002-1520-2268")),
person(given = "Boyu", family = "Yu",
email = "boyu.yu.tim@gmail.com",
role = "ctb"))
Author: Johannes Rainer <johannes.rainer@eurac.edu> with contributions
from Tim Triche, Sebastian Gibb, Laurent Gatto and
Christian Weichenberger.
from Tim Triche, Sebastian Gibb, Laurent Gatto
Christian Weichenberger and Boyu Yu.
Maintainer: Johannes Rainer <johannes.rainer@eurac.edu>
URL: https://github.com/jorainer/ensembldb
BugReports: https://github.com/jorainer/ensembldb/issues
Expand Down
58 changes: 52 additions & 6 deletions R/Methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -925,15 +925,61 @@ setMethod("lengthOf", "EnsDb", function(x, of="gene",
## For EnsDb: calls the .transcriptLengths function.
.transcriptLengths <- function(x, with.cds_len = FALSE, with.utr5_len = FALSE,
with.utr3_len = FALSE,
filter = AnnotationFilterList()) {
filter <- .processFilterParam(filter, x)
allTxs <- transcripts(x, filter = filter)
filter = AnnotationFilterList(),
exons = NA, transcripts = NA) {
## The preloaded data option is currently only for the coordinates mapping
## functions, therefore the filter is "tx_id" only.
preload_ranges_missing <- which(c(
identical(exons,NA),
identical(transcripts,NA)
))
if(identical(integer(0), preload_ranges_missing)){
if (!is(exons, "CompressedGRangesList"))
stop("Argument 'exons' has to be a 'CompressedGRangesList' object")
if (!is(transcripts, "GRanges"))
stop("Argument 'transcripts' has to be an 'GRanges' object")
if (identical(integer(0),grep('T[0-9]', names(exons)[[1]])))
stop("Argument 'exons' has to be by 'tx'.")
## Check if x is a formula and eventually translate it.
if (is(filter, "formula"))
filter <- AnnotationFilter(filter)
tryCatch({
filter_type <- filter@field
filter_tx <- filter@value
}, error = function(e){
message("No filter detected, all transcripts will be returned")
filter_tx <<- names(transcripts)
})
if (exists('filter_type')){
if (filter_type != "tx_id")
stop("Filter must be 'tx_id'.")
}
tryCatch({
allTxs <- transcripts[names(transcripts) %in% unique(filter_tx)]
}, error = function(e){
allTxs <<- GRanges()
})
} else if (length(preload_ranges_missing) == 2){
filter <- .processFilterParam(filter, x)
allTxs <- transcripts(x, filter = filter)
} else {
stop(paste(
"Argument",
c("'exons'", "'transcripts'")[preload_ranges_missing],
'missing.'
, sep = " "
))
}
if (length(allTxs) == 0)
return(data.frame(tx_id = character(), gene_id = character(),
nexon = integer(), tx_len = integer()))
exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
## Match ordering
exns <- exns[match(allTxs$tx_id, names(exns))]
if(identical(integer(0), preload_ranges_missing)){
exns <- exons[match(allTxs$tx_id, names(exons))]
} else {
exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
## Match ordering
exns <- exns[match(allTxs$tx_id, names(exns))]
}
## Calculate length of transcripts.
txLengths <- sum(width(exns))
## Calculate no. of exons.
Expand Down
103 changes: 89 additions & 14 deletions R/genomeToX.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
#' @param x `GRanges` object with the genomic coordinates that should be
#' mapped.
#'
#' @param db `EnsDb` object.
#' @param db `EnsDb` object or pre-loaded exons 'CompressedGRangesList' object
#' using exonsBy().
#'
#' @return
#'
Expand Down Expand Up @@ -75,11 +76,48 @@
#' ## The result of the mapping is an IRangesList each element providing the
#' ## within-transcript coordinates for each input region
#' genomeToTranscript(gnm, edbx)
#'
#' ## If you are tring to calculate within-transcript coordinates of a huge
#' ## list of genomic region, you shall use pre-loaded exons GRangesList to
#' ## replace the SQLite db edbx
#'
#' ## Below is just a lazy demo of querying multiple genomic region
#' library(parallel)
#'
#' gnm <- rep(GRanges("X:107715899-107715901"),10)
#'
#' exons <- exonsBy(EnsDb.Hsapiens.v86)
#'
#' ## You can pre-define the exons region to further accelerate the code.
#'
#' exons <- exonsBy(
#' EnsDb.Hsapiens.v86, by = "tx",
#' filter = AnnotationFilterList(
#' SeqNameFilter(as.character(unique(seqnames(gnm)))),
#' GeneStartFilter(max(end(gnm)), condition = "<="),
#' GeneEndFilter(min(start(gnm)), condition = ">=")
#' )
#' )
#'
#' ## only run in Linux ##
#' # res_temp <- mclapply(1:10, function(ind){
#' # genomeToTranscript(gnm[ind], exons)
#' # }, mc.preschedule = TRUE, mc.cores = detectCores() - 1)
#'
#' # res <- do.call(c,res_temp)
#'
#' cl <- makeCluster(detectCores() - 1)
#' clusterExport(cl,c('genomeToTranscript','gnm','exons'))
#'
#' res <- parLapply(cl,1:10,function(ind){
#' genomeToTranscript(gnm[ind], exons)
#' })
#' stopCluster(cl)
genomeToTranscript <- function(x, db) {
if (missing(x) || !is(x, "GRanges"))
stop("Argument 'x' is required and has to be a 'GRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
if (missing(db) || !(is(db, "EnsDb") || is(db,"CompressedGRangesList")))
stop("Argument 'db' is required and has to be an 'EnsDb' object or a 'CompressedGRangesList' object") # load the exons priorly
res <- .genome_to_tx(x, db)
if (is(res, "IRanges"))
res <- IRangesList(res)
Expand Down Expand Up @@ -115,6 +153,8 @@ genomeToTranscript <- function(x, db) {
#' within-protein coordinates.
#'
#' @param db `EnsDb` object.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
Expand Down Expand Up @@ -174,17 +214,44 @@ genomeToTranscript <- function(x, db) {
#'
#' ## Mapping of intronic positions fail
#' res[[4]]
genomeToProtein <- function(x, db) {
#'
#' ## Meanwhile, this function can be called in parallel processes if you preload
#' ## the protein, exons and transcripts database.
#'
#' proteins <- proteins(edbx)
#' exons <- exonsBy(edbx)
#' transcripts <- transcripts(edbx)
#'
#' genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts)
genomeToProtein <- function(x, db, proteins = NA, exons = NA, transcripts = NA) {
if (missing(x) || !is(x, "GRanges"))
stop("Argument 'x' is required and has to be a 'GRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
txs <- genomeToTranscript(x, db)
preload_ranges_missing <- which(c(
identical(proteins,NA),
identical(exons,NA),
identical(transcripts,NA)
))
if(identical(integer(0), preload_ranges_missing))
txs <- genomeToTranscript(x, exons)
else if (length(preload_ranges_missing) == 3) {
txs <- genomeToTranscript(x, db)
} else {
stop(paste(
"Argument",
c("'proteins'", "'exons'", "'transcripts'")[preload_ranges_missing],
'missing.'
, sep = " "
))
}
int_ids <- rep(1:length(txs), lengths(txs))
txs <- unlist(txs, use.names = FALSE)
if (is.null(names(txs)))
names(txs) <- ""
prts <- transcriptToProtein(txs, db)
if (identical(integer(0), preload_ranges_missing))
prts <- transcriptToProtein(txs, db, proteins = proteins, exons = exons, transcripts = transcripts)
else prts <- transcriptToProtein(txs, db)
mcols(prts) <- cbind(mcols(prts)[, c("tx_id", "cds_ok")],
mcols(txs)[, colnames(mcols(txs)) != "tx_id"])
prts <- split(prts, int_ids)
Expand Down Expand Up @@ -215,8 +282,10 @@ genomeToProtein <- function(x, db) {
#'
#' @noRd
.genome_to_tx_ranges <- function(genome, exns) {
genome_gr <- as(genome, "GRangesList") # preserve the metadata
genome_gr@unlistData@elementMetadata <- genome@elementMetadata
IRangesList(unlist(
lapply(as(genome, "GRangesList"), FUN = function(rgn, exns) {
lapply(genome_gr, FUN = function(rgn, exns) {
if (length(exns)) {
ints <- pintersect(exns, rgn, drop.nohit.ranges = TRUE,
ignore.strand = FALSE)
Expand Down Expand Up @@ -248,6 +317,7 @@ genomeToProtein <- function(x, db) {
seq_name = as.character(seqnames(rgn)),
seq_strand = as.character(strand(rgn))
)
if(!is.null(mcols(rgn))) dfrm <- DataFrame(dfrm, mcols(rgn))
mcols(irng) <- dfrm
irng
}, MoreArgs = list(rgn = rgn), USE.NAMES = FALSE)
Expand All @@ -261,6 +331,7 @@ genomeToProtein <- function(x, db) {
seq_start = start(rgn), seq_end = end(rgn),
seq_name = as.character(seqnames(rgn)),
seq_strand = as.character(strand(rgn)))
if(!is.null(mcols(rgn))) metad <- DataFrame(metad, mcols(rgn))
empty_rng <- IRanges(start = -1, width = 1)
mcols(empty_rng) <- metad
empty_rng
Expand All @@ -275,7 +346,7 @@ genomeToProtein <- function(x, db) {
#' checks if the region is completely included in an exons and returns
#' the position within the transcript for that exon.
#'
#' @param db `EnsDb`.
#' @param db `EnsDb` or 'CompressedGRangesList'
#'
#' @param genome `GRanges`
#'
Expand Down Expand Up @@ -315,12 +386,16 @@ genomeToProtein <- function(x, db) {
#' ## Example with two genes, on two strands!
.genome_to_tx <- function(genome, db) {
## Get exonsBy for all input ranges.
exns <- exonsBy(db, by = "tx",
filter = AnnotationFilterList(
SeqNameFilter(as.character(unique(seqnames(genome)))),
GeneStartFilter(max(end(genome)), condition = "<="),
GeneEndFilter(min(start(genome)), condition = ">=")
))
if(is(db, "EnsDb")){
exns <- exonsBy(
db, by = "tx",
filter = AnnotationFilterList(
SeqNameFilter(as.character(unique(seqnames(genome)))),
GeneStartFilter(max(end(genome)), condition = "<="),
GeneEndFilter(min(start(genome)), condition = ">=")
)
)
} else exns <- db
res <- .genome_to_tx_ranges(genome, exns)
if (!is.null(names(genome)))
names(res) <- names(genome)
Expand Down
Loading
Loading