jorainer · jorainer · Nov 20, 2023 · Oct 18, 2023 · Oct 27, 2023 · Nov 2, 2023
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -52,9 +52,9 @@ jobs:
  fail-fast: false
  matrix:
  config:
- - { os: ubuntu-latest, r: 'devel', bioc: '3.17', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- - { os: macOS-latest, r: 'devel', bioc: '3.17'}
- - { os: windows-latest, r: 'devel', bioc: '3.17'}
+ - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+ - { os: macOS-latest, r: 'devel', bioc: '3.19'}
+ - { os: windows-latest, r: 'devel', bioc: '3.19'}
  env:
  R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
  RSPM: ${{ matrix.config.rspm }}

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -20,10 +20,13 @@ Authors@R: c(person(given = "Johannes", family = "Rainer",
  person(given = "Laurent", family = "Gatto",
  email = "lg390@cam.ac.uk",
  role = "ctb",
- comment = c(ORCID = "0000-0002-1520-2268")))
+ comment = c(ORCID = "0000-0002-1520-2268")),
+ person(given = "Boyu", family = "Yu",
+ email = "boyu.yu.tim@gmail.com",
+ role = "ctb"))
 Author: Johannes Rainer <johannes.rainer@eurac.edu> with contributions
- from Tim Triche, Sebastian Gibb, Laurent Gatto and
- Christian Weichenberger.
+ from Tim Triche, Sebastian Gibb, Laurent Gatto 
+ Christian Weichenberger and Boyu Yu.
 Maintainer: Johannes Rainer <johannes.rainer@eurac.edu>
 URL: https://github.com/jorainer/ensembldb
 BugReports: https://github.com/jorainer/ensembldb/issues

diff --git a/R/Methods.R b/R/Methods.R
@@ -925,15 +925,61 @@ setMethod("lengthOf", "EnsDb", function(x, of="gene",
 ## For EnsDb: calls the .transcriptLengths function.
 .transcriptLengths <- function(x, with.cds_len = FALSE, with.utr5_len = FALSE,
  with.utr3_len = FALSE,
- filter = AnnotationFilterList()) {
- filter <- .processFilterParam(filter, x)
- allTxs <- transcripts(x, filter = filter)
+ filter = AnnotationFilterList(), 
+ exons = NA, transcripts = NA) {
+ ## The preloaded data option is currently only for the coordinates mapping
+ ## functions, therefore the filter is "tx_id" only.
+ preload_ranges_missing <- which(c(
+ identical(exons,NA),
+ identical(transcripts,NA)
+ ))
+ if(identical(integer(0), preload_ranges_missing)){
+ if (!is(exons, "CompressedGRangesList"))
+ stop("Argument 'exons' has to be a 'CompressedGRangesList' object")
+ if (!is(transcripts, "GRanges"))
+ stop("Argument 'transcripts' has to be an 'GRanges' object")
+ if (identical(integer(0),grep('T[0-9]', names(exons)[[1]])))
+ stop("Argument 'exons' has to be by 'tx'.")
+ ## Check if x is a formula and eventually translate it.
+ if (is(filter, "formula"))
+ filter <- AnnotationFilter(filter)
+ tryCatch({
+ filter_type <- filter@field
+ filter_tx <- filter@value
+ }, error = function(e){
+ message("No filter detected, all transcripts will be returned")
+ filter_tx <<- names(transcripts)
+ })
+ if (exists('filter_type')){
+ if (filter_type != "tx_id")
+ stop("Filter must be 'tx_id'.") 
+ }
+ tryCatch({
+ allTxs <- transcripts[names(transcripts) %in% unique(filter_tx)]
+ }, error = function(e){
+ allTxs <<- GRanges()
+ })
+ } else if (length(preload_ranges_missing) == 2){
+ filter <- .processFilterParam(filter, x)
+ allTxs <- transcripts(x, filter = filter) 
+ } else {
+ stop(paste(
+ "Argument", 
+ c("'exons'", "'transcripts'")[preload_ranges_missing],
+ 'missing.'
+ , sep = " "
+ ))
+ }
  if (length(allTxs) == 0)
  return(data.frame(tx_id = character(), gene_id = character(),
  nexon = integer(), tx_len = integer()))
- exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
- ## Match ordering
- exns <- exns[match(allTxs$tx_id, names(exns))]
+ if(identical(integer(0), preload_ranges_missing)){
+ exns <- exons[match(allTxs$tx_id, names(exons))]
+ } else {
+ exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
+ ## Match ordering
+ exns <- exns[match(allTxs$tx_id, names(exns))] 
+ }
  ## Calculate length of transcripts.
  txLengths <- sum(width(exns))
  ## Calculate no. of exons.

diff --git a/R/genomeToX.R b/R/genomeToX.R
@@ -26,7 +26,8 @@
 #' @param x `GRanges` object with the genomic coordinates that should be
 #' mapped.
 #'
-#' @param db `EnsDb` object.
+#' @param db `EnsDb` object or pre-loaded exons 'CompressedGRangesList' object 
+#' using exonsBy().
 #'
 #' @return
 #'
@@ -75,11 +76,48 @@
 #' ## The result of the mapping is an IRangesList each element providing the
 #' ## within-transcript coordinates for each input region
 #' genomeToTranscript(gnm, edbx)
+#' 
+#' ## If you are tring to calculate within-transcript coordinates of a huge 
+#' ## list of genomic region, you shall use pre-loaded exons GRangesList to 
+#' ## replace the SQLite db edbx
+#' 
+#' ## Below is just a lazy demo of querying multiple genomic region 
+#' library(parallel)
+#' 
+#' gnm <- rep(GRanges("X:107715899-107715901"),10)
+#' 
+#' exons <- exonsBy(EnsDb.Hsapiens.v86)
+#' 
+#' ## You can pre-define the exons region to further accelerate the code.
+#' 
+#' exons <- exonsBy(
+#' EnsDb.Hsapiens.v86, by = "tx",
+#' filter = AnnotationFilterList(
+#' SeqNameFilter(as.character(unique(seqnames(gnm)))),
+#' GeneStartFilter(max(end(gnm)), condition = "<="),
+#' GeneEndFilter(min(start(gnm)), condition = ">=")
+#' )
+#' )
+#' 
+#' ## only run in Linux ## 
+#' # res_temp <- mclapply(1:10, function(ind){
+#' # genomeToTranscript(gnm[ind], exons)
+#' # }, mc.preschedule = TRUE, mc.cores = detectCores() - 1)
+#' 
+#' # res <- do.call(c,res_temp)
+#' 
+#' cl <- makeCluster(detectCores() - 1)
+#' clusterExport(cl,c('genomeToTranscript','gnm','exons'))
+#' 
+#' res <- parLapply(cl,1:10,function(ind){
+#' genomeToTranscript(gnm[ind], exons)
+#' })
+#' stopCluster(cl)
 genomeToTranscript <- function(x, db) {
  if (missing(x) || !is(x, "GRanges"))
  stop("Argument 'x' is required and has to be a 'GRanges' object")
- if (missing(db) || !is(db, "EnsDb"))
- stop("Argument 'db' is required and has to be an 'EnsDb' object")
+ if (missing(db) || !(is(db, "EnsDb") || is(db,"CompressedGRangesList"))) 
+ stop("Argument 'db' is required and has to be an 'EnsDb' object or a 'CompressedGRangesList' object") # load the exons priorly
  res <- .genome_to_tx(x, db)
  if (is(res, "IRanges"))
  res <- IRangesList(res)
@@ -115,6 +153,8 @@ genomeToTranscript <- function(x, db) {
 #' within-protein coordinates.
 #'
 #' @param db `EnsDb` object.
+#' 
+#' @inheritParams transcriptToProtein
 #'
 #' @return
 #'
@@ -174,17 +214,44 @@ genomeToTranscript <- function(x, db) {
 #'
 #' ## Mapping of intronic positions fail
 #' res[[4]]
-genomeToProtein <- function(x, db) {
+#' 
+#' ## Meanwhile, this function can be called in parallel processes if you preload
+#' ## the protein, exons and transcripts database.
+#' 
+#' proteins <- proteins(edbx)
+#' exons <- exonsBy(edbx)
+#' transcripts <- transcripts(edbx)
+#' 
+#' genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts)
+genomeToProtein <- function(x, db, proteins = NA, exons = NA, transcripts = NA) {
  if (missing(x) || !is(x, "GRanges"))
  stop("Argument 'x' is required and has to be a 'GRanges' object")
  if (missing(db) || !is(db, "EnsDb"))
  stop("Argument 'db' is required and has to be an 'EnsDb' object")
- txs <- genomeToTranscript(x, db)
+ preload_ranges_missing <- which(c(
+ identical(proteins,NA),
+ identical(exons,NA),
+ identical(transcripts,NA)
+ ))
+ if(identical(integer(0), preload_ranges_missing))
+ txs <- genomeToTranscript(x, exons)
+ else if (length(preload_ranges_missing) == 3) {
+ txs <- genomeToTranscript(x, db)
+ } else {
+ stop(paste(
+ "Argument", 
+ c("'proteins'", "'exons'", "'transcripts'")[preload_ranges_missing],
+ 'missing.'
+ , sep = " "
+ ))
+ }
  int_ids <- rep(1:length(txs), lengths(txs))
  txs <- unlist(txs, use.names = FALSE)
  if (is.null(names(txs)))
  names(txs) <- ""
- prts <- transcriptToProtein(txs, db)
+ if (identical(integer(0), preload_ranges_missing))
+ prts <- transcriptToProtein(txs, db, proteins = proteins, exons = exons, transcripts = transcripts)
+ else prts <- transcriptToProtein(txs, db)
  mcols(prts) <- cbind(mcols(prts)[, c("tx_id", "cds_ok")],
  mcols(txs)[, colnames(mcols(txs)) != "tx_id"])
  prts <- split(prts, int_ids)
@@ -215,8 +282,10 @@ genomeToProtein <- function(x, db) {
 #'
 #' @noRd
 .genome_to_tx_ranges <- function(genome, exns) {
+ genome_gr <- as(genome, "GRangesList") # preserve the metadata
+ genome_gr@unlistData@elementMetadata <- genome@elementMetadata
  IRangesList(unlist(
- lapply(as(genome, "GRangesList"), FUN = function(rgn, exns) {
+ lapply(genome_gr, FUN = function(rgn, exns) {
  if (length(exns)) {
  ints <- pintersect(exns, rgn, drop.nohit.ranges = TRUE,
  ignore.strand = FALSE)
@@ -248,6 +317,7 @@ genomeToProtein <- function(x, db) {
  seq_name = as.character(seqnames(rgn)),
  seq_strand = as.character(strand(rgn))
  )
+ if(!is.null(mcols(rgn))) dfrm <- DataFrame(dfrm, mcols(rgn))
  mcols(irng) <- dfrm
  irng
  }, MoreArgs = list(rgn = rgn), USE.NAMES = FALSE)
@@ -261,6 +331,7 @@ genomeToProtein <- function(x, db) {
  seq_start = start(rgn), seq_end = end(rgn),
  seq_name = as.character(seqnames(rgn)),
  seq_strand = as.character(strand(rgn)))
+ if(!is.null(mcols(rgn))) metad <- DataFrame(metad, mcols(rgn))
  empty_rng <- IRanges(start = -1, width = 1)
  mcols(empty_rng) <- metad
  empty_rng
@@ -275,7 +346,7 @@ genomeToProtein <- function(x, db) {
 #' checks if the region is completely included in an exons and returns
 #' the position within the transcript for that exon.
 #'
-#' @param db `EnsDb`.
+#' @param db `EnsDb` or 'CompressedGRangesList' 
 #'
 #' @param genome `GRanges`
 #'
@@ -315,12 +386,16 @@ genomeToProtein <- function(x, db) {
 #' ## Example with two genes, on two strands!
 .genome_to_tx <- function(genome, db) {
  ## Get exonsBy for all input ranges.
- exns <- exonsBy(db, by = "tx",
- filter = AnnotationFilterList(
- SeqNameFilter(as.character(unique(seqnames(genome)))),
- GeneStartFilter(max(end(genome)), condition = "<="),
- GeneEndFilter(min(start(genome)), condition = ">=")
- ))
+ if(is(db, "EnsDb")){
+ exns <- exonsBy(
+ db, by = "tx",
+ filter = AnnotationFilterList(
+ SeqNameFilter(as.character(unique(seqnames(genome)))),
+ GeneStartFilter(max(end(genome)), condition = "<="),
+ GeneEndFilter(min(start(genome)), condition = ">=")
+ )
+ )
+ } else exns <- db
  res <- .genome_to_tx_ranges(genome, exns)
  if (!is.null(names(genome)))
  names(res) <- names(genome)