diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 358f070..3c8878c 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -52,9 +52,9 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: 'devel', bioc: '3.17', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: 'devel', bioc: '3.17'} - - { os: windows-latest, r: 'devel', bioc: '3.17'} + - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: 'devel', bioc: '3.19'} + - { os: windows-latest, r: 'devel', bioc: '3.19'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} diff --git a/DESCRIPTION b/DESCRIPTION index c893e15..8f161ae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,10 +20,13 @@ Authors@R: c(person(given = "Johannes", family = "Rainer", person(given = "Laurent", family = "Gatto", email = "lg390@cam.ac.uk", role = "ctb", - comment = c(ORCID = "0000-0002-1520-2268"))) + comment = c(ORCID = "0000-0002-1520-2268")), + person(given = "Boyu", family = "Yu", + email = "boyu.yu.tim@gmail.com", + role = "ctb")) Author: Johannes Rainer with contributions - from Tim Triche, Sebastian Gibb, Laurent Gatto and - Christian Weichenberger. + from Tim Triche, Sebastian Gibb, Laurent Gatto + Christian Weichenberger and Boyu Yu. Maintainer: Johannes Rainer URL: https://github.com/jorainer/ensembldb BugReports: https://github.com/jorainer/ensembldb/issues diff --git a/R/Methods.R b/R/Methods.R index 8eb8436..2865c6b 100644 --- a/R/Methods.R +++ b/R/Methods.R @@ -925,15 +925,61 @@ setMethod("lengthOf", "EnsDb", function(x, of="gene", ## For EnsDb: calls the .transcriptLengths function. .transcriptLengths <- function(x, with.cds_len = FALSE, with.utr5_len = FALSE, with.utr3_len = FALSE, - filter = AnnotationFilterList()) { - filter <- .processFilterParam(filter, x) - allTxs <- transcripts(x, filter = filter) + filter = AnnotationFilterList(), + exons = NA, transcripts = NA) { + ## The preloaded data option is currently only for the coordinates mapping + ## functions, therefore the filter is "tx_id" only. + preload_ranges_missing <- which(c( + identical(exons,NA), + identical(transcripts,NA) + )) + if(identical(integer(0), preload_ranges_missing)){ + if (!is(exons, "CompressedGRangesList")) + stop("Argument 'exons' has to be a 'CompressedGRangesList' object") + if (!is(transcripts, "GRanges")) + stop("Argument 'transcripts' has to be an 'GRanges' object") + if (identical(integer(0),grep('T[0-9]', names(exons)[[1]]))) + stop("Argument 'exons' has to be by 'tx'.") + ## Check if x is a formula and eventually translate it. + if (is(filter, "formula")) + filter <- AnnotationFilter(filter) + tryCatch({ + filter_type <- filter@field + filter_tx <- filter@value + }, error = function(e){ + message("No filter detected, all transcripts will be returned") + filter_tx <<- names(transcripts) + }) + if (exists('filter_type')){ + if (filter_type != "tx_id") + stop("Filter must be 'tx_id'.") + } + tryCatch({ + allTxs <- transcripts[names(transcripts) %in% unique(filter_tx)] + }, error = function(e){ + allTxs <<- GRanges() + }) + } else if (length(preload_ranges_missing) == 2){ + filter <- .processFilterParam(filter, x) + allTxs <- transcripts(x, filter = filter) + } else { + stop(paste( + "Argument", + c("'exons'", "'transcripts'")[preload_ranges_missing], + 'missing.' + , sep = " " + )) + } if (length(allTxs) == 0) return(data.frame(tx_id = character(), gene_id = character(), nexon = integer(), tx_len = integer())) - exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id)) - ## Match ordering - exns <- exns[match(allTxs$tx_id, names(exns))] + if(identical(integer(0), preload_ranges_missing)){ + exns <- exons[match(allTxs$tx_id, names(exons))] + } else { + exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id)) + ## Match ordering + exns <- exns[match(allTxs$tx_id, names(exns))] + } ## Calculate length of transcripts. txLengths <- sum(width(exns)) ## Calculate no. of exons. diff --git a/R/genomeToX.R b/R/genomeToX.R index 52c3915..d72f0bb 100644 --- a/R/genomeToX.R +++ b/R/genomeToX.R @@ -26,7 +26,8 @@ #' @param x `GRanges` object with the genomic coordinates that should be #' mapped. #' -#' @param db `EnsDb` object. +#' @param db `EnsDb` object or pre-loaded exons 'CompressedGRangesList' object +#' using exonsBy(). #' #' @return #' @@ -75,11 +76,48 @@ #' ## The result of the mapping is an IRangesList each element providing the #' ## within-transcript coordinates for each input region #' genomeToTranscript(gnm, edbx) +#' +#' ## If you are tring to calculate within-transcript coordinates of a huge +#' ## list of genomic region, you shall use pre-loaded exons GRangesList to +#' ## replace the SQLite db edbx +#' +#' ## Below is just a lazy demo of querying multiple genomic region +#' library(parallel) +#' +#' gnm <- rep(GRanges("X:107715899-107715901"),10) +#' +#' exons <- exonsBy(EnsDb.Hsapiens.v86) +#' +#' ## You can pre-define the exons region to further accelerate the code. +#' +#' exons <- exonsBy( +#' EnsDb.Hsapiens.v86, by = "tx", +#' filter = AnnotationFilterList( +#' SeqNameFilter(as.character(unique(seqnames(gnm)))), +#' GeneStartFilter(max(end(gnm)), condition = "<="), +#' GeneEndFilter(min(start(gnm)), condition = ">=") +#' ) +#' ) +#' +#' ## only run in Linux ## +#' # res_temp <- mclapply(1:10, function(ind){ +#' # genomeToTranscript(gnm[ind], exons) +#' # }, mc.preschedule = TRUE, mc.cores = detectCores() - 1) +#' +#' # res <- do.call(c,res_temp) +#' +#' cl <- makeCluster(detectCores() - 1) +#' clusterExport(cl,c('genomeToTranscript','gnm','exons')) +#' +#' res <- parLapply(cl,1:10,function(ind){ +#' genomeToTranscript(gnm[ind], exons) +#' }) +#' stopCluster(cl) genomeToTranscript <- function(x, db) { if (missing(x) || !is(x, "GRanges")) stop("Argument 'x' is required and has to be a 'GRanges' object") - if (missing(db) || !is(db, "EnsDb")) - stop("Argument 'db' is required and has to be an 'EnsDb' object") + if (missing(db) || !(is(db, "EnsDb") || is(db,"CompressedGRangesList"))) + stop("Argument 'db' is required and has to be an 'EnsDb' object or a 'CompressedGRangesList' object") # load the exons priorly res <- .genome_to_tx(x, db) if (is(res, "IRanges")) res <- IRangesList(res) @@ -115,6 +153,8 @@ genomeToTranscript <- function(x, db) { #' within-protein coordinates. #' #' @param db `EnsDb` object. +#' +#' @inheritParams transcriptToProtein #' #' @return #' @@ -174,17 +214,44 @@ genomeToTranscript <- function(x, db) { #' #' ## Mapping of intronic positions fail #' res[[4]] -genomeToProtein <- function(x, db) { +#' +#' ## Meanwhile, this function can be called in parallel processes if you preload +#' ## the protein, exons and transcripts database. +#' +#' proteins <- proteins(edbx) +#' exons <- exonsBy(edbx) +#' transcripts <- transcripts(edbx) +#' +#' genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts) +genomeToProtein <- function(x, db, proteins = NA, exons = NA, transcripts = NA) { if (missing(x) || !is(x, "GRanges")) stop("Argument 'x' is required and has to be a 'GRanges' object") if (missing(db) || !is(db, "EnsDb")) stop("Argument 'db' is required and has to be an 'EnsDb' object") - txs <- genomeToTranscript(x, db) + preload_ranges_missing <- which(c( + identical(proteins,NA), + identical(exons,NA), + identical(transcripts,NA) + )) + if(identical(integer(0), preload_ranges_missing)) + txs <- genomeToTranscript(x, exons) + else if (length(preload_ranges_missing) == 3) { + txs <- genomeToTranscript(x, db) + } else { + stop(paste( + "Argument", + c("'proteins'", "'exons'", "'transcripts'")[preload_ranges_missing], + 'missing.' + , sep = " " + )) + } int_ids <- rep(1:length(txs), lengths(txs)) txs <- unlist(txs, use.names = FALSE) if (is.null(names(txs))) names(txs) <- "" - prts <- transcriptToProtein(txs, db) + if (identical(integer(0), preload_ranges_missing)) + prts <- transcriptToProtein(txs, db, proteins = proteins, exons = exons, transcripts = transcripts) + else prts <- transcriptToProtein(txs, db) mcols(prts) <- cbind(mcols(prts)[, c("tx_id", "cds_ok")], mcols(txs)[, colnames(mcols(txs)) != "tx_id"]) prts <- split(prts, int_ids) @@ -215,8 +282,10 @@ genomeToProtein <- function(x, db) { #' #' @noRd .genome_to_tx_ranges <- function(genome, exns) { + genome_gr <- as(genome, "GRangesList") # preserve the metadata + genome_gr@unlistData@elementMetadata <- genome@elementMetadata IRangesList(unlist( - lapply(as(genome, "GRangesList"), FUN = function(rgn, exns) { + lapply(genome_gr, FUN = function(rgn, exns) { if (length(exns)) { ints <- pintersect(exns, rgn, drop.nohit.ranges = TRUE, ignore.strand = FALSE) @@ -248,6 +317,7 @@ genomeToProtein <- function(x, db) { seq_name = as.character(seqnames(rgn)), seq_strand = as.character(strand(rgn)) ) + if(!is.null(mcols(rgn))) dfrm <- DataFrame(dfrm, mcols(rgn)) mcols(irng) <- dfrm irng }, MoreArgs = list(rgn = rgn), USE.NAMES = FALSE) @@ -261,6 +331,7 @@ genomeToProtein <- function(x, db) { seq_start = start(rgn), seq_end = end(rgn), seq_name = as.character(seqnames(rgn)), seq_strand = as.character(strand(rgn))) + if(!is.null(mcols(rgn))) metad <- DataFrame(metad, mcols(rgn)) empty_rng <- IRanges(start = -1, width = 1) mcols(empty_rng) <- metad empty_rng @@ -275,7 +346,7 @@ genomeToProtein <- function(x, db) { #' checks if the region is completely included in an exons and returns #' the position within the transcript for that exon. #' -#' @param db `EnsDb`. +#' @param db `EnsDb` or 'CompressedGRangesList' #' #' @param genome `GRanges` #' @@ -315,12 +386,16 @@ genomeToProtein <- function(x, db) { #' ## Example with two genes, on two strands! .genome_to_tx <- function(genome, db) { ## Get exonsBy for all input ranges. - exns <- exonsBy(db, by = "tx", - filter = AnnotationFilterList( - SeqNameFilter(as.character(unique(seqnames(genome)))), - GeneStartFilter(max(end(genome)), condition = "<="), - GeneEndFilter(min(start(genome)), condition = ">=") - )) + if(is(db, "EnsDb")){ + exns <- exonsBy( + db, by = "tx", + filter = AnnotationFilterList( + SeqNameFilter(as.character(unique(seqnames(genome)))), + GeneStartFilter(max(end(genome)), condition = "<="), + GeneEndFilter(min(start(genome)), condition = ">=") + ) + ) + } else exns <- db res <- .genome_to_tx_ranges(genome, exns) if (!is.null(names(genome))) names(res) <- names(genome) diff --git a/R/proteinToX.R b/R/proteinToX.R index e82b3e9..153db5c 100644 --- a/R/proteinToX.R +++ b/R/proteinToX.R @@ -2,6 +2,10 @@ #' @title Map protein-relative coordinates to positions within the transcript #' +#' @name proteinToTranscript +#' +#' @aliases proteinToTranscript proteinToTranscript,EnsDb-method +#' #' @description #' #' `proteinToTranscript` maps protein-relative coordinates to positions within @@ -34,7 +38,9 @@ #' use these as input for the mapping function. #' #' @inheritParams proteinToGenome -#' +#' +#' @param ... Further arguments to be passed on. +#' #' @return #' #' `IRangesList`, each element being the mapping results for one of the input @@ -100,8 +106,11 @@ #' #' ## The result for the region within the second protein #' res[[2]] -proteinToTranscript <- function(x, db, id = "name", - idType = "protein_id") { +setGeneric("proteinToTranscript", signature="db", + function(x, db, ...) standardGeneric("proteinToTranscript") +) +setMethod("proteinToTranscript", "EnsDb", + function(x, db, id = "name", idType = "protein_id") { if (missing(x) || !is(x, "IRanges")) stop("Argument 'x' is required and has to be an 'IRanges' object") if (missing(db) || !is(db, "EnsDb")) @@ -149,6 +158,8 @@ proteinToTranscript <- function(x, db, id = "name", else mc$protein_id <- names(prt) ir <- IRanges(start = -1, width = 1) mcols(ir) <- mc + if(!is.null(mcols(prt))) + mcols(ir) <- cbind(mcols(ir),mcols(prt)) ir } else { ids <- names(gnm) @@ -167,11 +178,132 @@ proteinToTranscript <- function(x, db, id = "name", if (idType == "uniprot_id") mc$uniprot_id <- names(prt) mcols(res) <- mc + if(!is.null(mcols(prt))) + mcols(res) <- cbind(mcols(res),mcols(prt)) res } }), "IRangesList") -} +}) +#' @rdname proteinToTranscript +#' +#' @aliases proteinToTranscript proteinToTranscript,Preloaded-method +#' +#' @inheritParams proteinToGenome +#' +#' @param fiveUTR A `CompressedGRangesList` object generated by +#' [fiveUTRsByTranscript()]. +#' +#' @family coordinate mapping functions +#' +#' @author Johannes Rainer +#' +#' @md +#' +#' @examples +#' +#' ## Meanwhile, this function can be called in parallel processes if you preload +#' ## the CDS data with desired data columns and fiveUTR data +#' +#' cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence')) +#' # cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence')) +#' # cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence')) +#' +#' fiveUTR <- fiveUTRsByTranscript(edbx) +#' +#' ## Define an IRange with protein-relative coordinates within a protein for +#' ## the gene SYP +#' syp <- IRanges(start = 4, end = 17) +#' names(syp) <- "ENSP00000418169" +#' res <- proteinToTranscript(syp, cds, fiveUTR = fiveUTR) +#' res +#' ## Positions 4 to 17 within the protein span are encoded by the region +#' ## from nt 23 to 64. +#' +#' ## Perform the mapping for multiple proteins identified by their Uniprot +#' ## IDs. +#' ids <- c("O15266", "Q9HBJ8", "unexistant") +#' prngs <- IRanges(start = c(13, 43, 100), end = c(21, 80, 100)) +#' names(prngs) <- ids +#' +#' res <- proteinToTranscript(prngs, cds, idType = "uniprot_id", fiveUTR = fiveUTR) +setMethod("proteinToTranscript", "CompressedGRangesList", + function(x, db, id = "name", idType = "protein_id", fiveUTR) { + if (missing(x) || !is(x, "IRanges")) + stop("Argument 'x' is required and has to be an 'IRanges' object") + if (missing(db) || !is(db, "CompressedGRangesList")) + stop("Argument 'db' is required and has to be an 'CompressedGRangesList' object") + if (missing(fiveUTR) || !is(fiveUTR, "CompressedGRangesList")) + stop("Argument 'fiveUTR' is required and has to be an 'CompressedGRangesList' object") + coords_cds <- .proteinCoordsToTx(x) + ## 1) retrieve CDS for each protein + message("Fetching CDS for ", length(x), " proteins ... ", + appendLF = FALSE) + cds_genome <- .cds_for_id_range(db, x, id = id, idType = idType) + miss <- lengths(cds_genome) == 0 + if (any(miss)) + warning("No CDS found for: ", paste0(names(cds_genome)[miss], + collapse = ", ")) + message(sum(!miss), " found") + ## 2) ensure that the CDS matches the AA sequence length + message("Checking CDS and protein sequence lengths ... ", appendLF = FALSE) + cds_genome <- .cds_matching_protein(cds_genome) + are_ok <- vapply(cds_genome, function(z) { + if (is(z, "GRangesList")) + all(z[[1]]$cds_ok) + else NA + }, FUN.VALUE = logical(1)) + are_ok <- are_ok[!is.na(are_ok)] + ## We've got now a list of GRanges + message(sum(are_ok), "/", length(are_ok), " OK") + ## Get for each transcript it's 5' UTR and add its width to the coords_cds + tx_ids <- unique(unlist(lapply(cds_genome, names), use.names = FALSE)) + if (length(tx_ids)) { + five_utr <- fiveUTR[names(fiveUTR) %in% tx_ids] + ## Calculate 5' widths for these + five_width <- sum(width(five_utr)) + } + as(mapply( + cds_genome, as(coords_cds, "IRangesList"), split(x, 1:length(x)), + FUN = function(gnm, cds, prt) { + if (is.null(gnm)) { + ## Define the metadata columns + mc <- DataFrame(protein_id = NA_character_, + tx_id = NA_character_, + cds_ok = NA, + protein_start = start(prt), + protein_end = end(prt)) + if (idType == "uniprot_id") + mc$uniprot_id <- names(prt) + else mc$protein_id <- names(prt) + ir <- IRanges(start = -1, width = 1) + mcols(ir) <- mc + if(!is.null(mcols(prt))) + mcols(ir) <- cbind(mcols(ir),mcols(prt)) + ir + } else { + ids <- names(gnm) + res <- IRanges(start = start(cds) + five_width[ids], + end = end(cds) + five_width[ids], + names = ids) + ## Populate mcols + mc <- DataFrame( + protein_id = unlist( + lapply(gnm, function(z) z$protein_id[1])), + tx_id = ids, + cds_ok = gnm[[1]]$cds_ok[1], + protein_start = start(prt), + protein_end = end(prt) + ) + if (idType == "uniprot_id") + mc$uniprot_id <- names(prt) + mcols(res) <- mc + if(!is.null(mcols(prt))) + mcols(res) <- cbind(mcols(res),mcols(prt)) + res + } + }), "IRangesList") +}) #' @title Map within-protein coordinates to genomic coordinates #' @@ -341,8 +473,10 @@ setMethod("proteinToGenome", "EnsDb", are_ok <- are_ok[!is.na(are_ok)] ## We've got now a list of GRanges ## Perform the mapping for each input range with each mapped cds + x_IRangesList <- as(x, "IRangesList") ## preserve the metadata + x_IRangesList@unlistData@elementMetadata <- x@elementMetadata res <- mapply( - cds_genome, as(coords_cds, "IRangesList"), as(x, "IRangesList"), + cds_genome, as(coords_cds, "IRangesList"), x_IRangesList, FUN = function(gnm, cds, prt) { if (!length(gnm)) { ## addresses NULL and empty elements GRanges() @@ -355,6 +489,101 @@ setMethod("proteinToGenome", "EnsDb", mcols(maps) <- cbind(mcols(maps), protein_start = start(prt), protein_end = end(prt)) + if(!is.null(mcols(prt))) + mcols(maps) <- cbind(mcols(maps),mcols(prt)) + maps[order(maps$exon_rank)] + } + }) + ## Split each element again, if there are more than one protein_id. Names + ## of the elements are then the protein_id. + lapply(res, function(z) { + if (length(unique(z$protein_id)) > 1) + split(z, f = z$protein_id) + else z + }) + }) + +#' @rdname proteinToGenome +#' +#' @aliases proteinToGenome proteinToGenome,Preloaded-method +#' +#' @param db For the method for `EnsDb` objects: An `EnsDb` object to be used to +#' retrieve genomic coordinates of encoding transcripts.

+#' For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object +#' generated by [cdsBy()] where by = 'tx' and columns = c('tx_id' +#' ,'protein_id','uniprot_id','protein_sequence'). +#' +#' @md +#' +#' @examples +#' +#' ## Meanwhile, this function can be called in parallel processes if you preload +#' ## the CDS data with desired data columns +#' cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence')) +#' # cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence')) +#' # cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence')) +#' ## Define an IRange with protein-relative coordinates within a protein for +#' ## the gene SYP +#' syp <- IRanges(start = 4, end = 17) +#' names(syp) <- "ENSP00000418169" +#' res <- proteinToGenome(syp, cds) +#' res +#' ## Positions 4 to 17 within the protein span two exons of the encoding +#' ## transcript. +#' +#' ## Perform the mapping for multiple proteins identified by their Uniprot +#' ## IDs. +#' ids <- c("O15266", "Q9HBJ8", "unexistant") +#' prngs <- IRanges(start = c(13, 43, 100), end = c(21, 80, 100)) +#' names(prngs) <- ids +#' +#' res <- proteinToGenome(prngs, cds, idType = "uniprot_id") +setMethod("proteinToGenome", "CompressedGRangesList", + function(x, db, id = "name", idType = "protein_id") { + if (missing(x) || !is(x, "IRanges")) + stop("Argument 'x' is required and has to be an 'IRanges' object") + if (missing(db) || !is(db, "CompressedGRangesList")) + stop("Argument 'db' is required and has to be an 'CompressedGRangesList' object") + coords_cds <- .proteinCoordsToTx(x) + ## 1) retrieve CDS for each protein + message("Fetching CDS for ", length(x), " proteins ... ", + appendLF = FALSE) + cds_genome <- .cds_for_id_range(db, x, id = id, idType = idType) + miss <- lengths(cds_genome) == 0 + if (any(miss)) + warning("No CDS found for: ", paste0(names(cds_genome)[miss], + collapse = ", ")) + message(sum(!miss), " found") + ## 2) ensure that the CDS matches the AA sequence length + message("Checking CDS and protein sequence lengths ... ", appendLF = FALSE) + cds_genome <- .cds_matching_protein(cds_genome) + are_ok <- unlist(lapply(cds_genome, function(z) { + if (is(z, "GRangesList") & length(z) > 0) + all(z[[1]]$cds_ok) + else NA + })) + message(sum(are_ok, na.rm = TRUE), "/", length(are_ok), " OK") + are_ok <- are_ok[!is.na(are_ok)] + ## We've got now a list of GRanges + ## Perform the mapping for each input range with each mapped cds + x_IRangesList <- as(x, "IRangesList") ## preserve the metadata + x_IRangesList@unlistData@elementMetadata <- x@elementMetadata + res <- mapply( + cds_genome, as(coords_cds, "IRangesList"), x_IRangesList, + FUN = function(gnm, cds, prt) { + if (!length(gnm)) { ## addresses NULL and empty elements + GRanges() + } else { + ## Unlist because we'd like to have a GRanges here. Will split + ## again later. + maps <- unlist(.to_genome(gnm, cds)) + ## Don't want to have GRanges names! + names(maps) <- NULL + mcols(maps) <- cbind(mcols(maps), + protein_start = start(prt), + protein_end = end(prt)) + if(!is.null(mcols(prt))) + mcols(maps) <- cbind(mcols(maps),mcols(prt)) maps[order(maps$exon_rank)] } }) @@ -437,7 +666,7 @@ setMethod("proteinToGenome", "EnsDb", #' #' Use one query to fetch CDS for all (unique) input IDs. If input IDs are #' Uniprot identifiers we have to perform additional checks and data -#' re-organizations because one transript (and thus CDS) can be associated +#' re-organizations because one transcript (and thus CDS) can be associated #' with multiple Uniprot identifiers. #' #' @param x `EnsDb` object. @@ -464,7 +693,7 @@ setMethod("proteinToGenome", "EnsDb", tx_id <- split(map$tx_id, map[, idType]) } else { tx_id <- id - split(tx_id, id) + tx_id <- split(tx_id, id) } if (length(tx_id)) { suppressWarnings( @@ -490,6 +719,83 @@ setMethod("proteinToGenome", "EnsDb", cds } +#' @description Fetch the CDS for all transcripts encoding the specified protein using +#' preloaded data. +#' +#' @note +#' +#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input +#' IDs are Uniprot identifiers we have to perform additional checks and data +#' re-organizations because one transcript (and thus CDS) can be associated +#' with multiple Uniprot identifiers. +#' +#' @param x `CompressedGRangesList` object generated by [cdsBy()] where +#' by = 'tx' and columns = c(listColumns(edb,'tx'),'protein_id','uniprot_id','protein_sequence'). +#' +#' @param id `character` with the protein ID(s). +#' +#' @param idType `character(1)` defining what type of IDs are provided. Has to +#' be one of `"protein_id"` (default), `"uniprot_id"` or `"tx_id"`. +#' +#' @return a `list` with the CDS of the encoding transcript(s) for each provided +#' id (as a `GRangesList`). Names of the `list` are the ids, if no +#' transcript was found `NULL` is returned. +#' +#' @author Johannes Rainer, Boyu Yu +#' +#' @md +#' +#' @noRd +setMethod(".cds_for_id2", "CompressedGRangesList", + function(x, id, idType = "protein_id") { + ## No need for get the transcripts first, just use the metadata of CDS + ## if you find this modification well, you may want to change the + ## generic .cds_for_id2 function above + if (!all(c("tx_id", "protein_id",idType,"protein_sequence") %in% colnames(mcols(x[[1]])))) + stop( + paste( + c( + "CDS must have", + unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))), + "in the metadata." + ), + collapse = ' ') + ) + map <- x@unlistData[as.data.frame(x@partitioning)[,1]]@elementMetadata[,unique(c('tx_id', 'protein_id', idType))] + map <- map[map[,idType] %in% id,] + if (idType != "tx_id") { + tx_id <- split(map$tx_id, map[, idType]) + } else { + tx_id <- id + tx_id <- split(tx_id, id) + } + if (length(tx_id)) { + cds <- x[names(x) %in% unique(unlist(tx_id, use.names = FALSE))] + if (length(cds)) { + if (idType == "uniprot_id") { + ## Additional (in)sanity checks for Uniprot identifiers + ## This has a significant impact on performance, but otherwise + ## we end up with duplicated transcript entries! + tmp <- unlist(cds) + tmp <- as.list(split(unname(tmp), tmp$uniprot_id)) + cds <- lapply(tmp, function(z) split(z, z$tx_id)) + } else if ("uniprot_id" %in% colnames(mcols(x[[1]]))) { + warning( + "Multiple uniprot ids were preserved for the same transcript entry. ", + "Please don't include uniprot_id column in cdsBy() if unneeded." + ) + cds <- lapply(tx_id, function(z) cds[z]) + } else { + cds <- lapply(tx_id, function(z) cds[z]) + } + } + else cds <- list() + } else cds <- list() + cds <- cds[id] + names(cds) <- id # to add also names for elements not found + cds +}) + #' @description #' #' Uses .cds_for_id to fetch CDS for protein identifiers and in addition @@ -545,6 +851,8 @@ setMethod("proteinToGenome", "EnsDb", #' @param x `EnsDb` object. #' #' @param cds `GRangesList` +#' +#' @param ... Further arguments to be passed on.. #' #' @return `list` of `GRangesList`s with one list element per input protein, #' and the `GRangesList` containing the CDS of all matching (or one not @@ -555,7 +863,11 @@ setMethod("proteinToGenome", "EnsDb", #' @md #' #' @noRd -.cds_matching_protein <- function(x, cds) { +setGeneric(".cds_matching_protein", signature="x", + function(x, ...) standardGeneric(".cds_matching_protein") +) +setMethod(".cds_matching_protein", "EnsDb", + function(x, cds) { ## Fetch the protein sequences, all in one go. ## Loop through the cds prot_ids <- unique(unlist(lapply(cds, function(z) { @@ -596,7 +908,67 @@ setMethod("proteinToGenome", "EnsDb", } } }) -} +}) + +#' @description +#' +#' Fetches the protein sequence using the provided `"protein_id"` in `cds` and +#' checks whether the cds lenghts match the protein sequence. The +#' function returns all CDS for each protein/transcript that have the correct +#' length) and throws a warning if none of the specified CDS matches the +#' protein sequence. In the latter case a single CDS (the one with the CDS +#' length closest to the expected length). Whether a CDS has the expected length +#' or not is also reported in the metadata column `"cds_ok"`. +#' +#' @details +#' +#' Mismatch between the CDS and the AA sequence are in most instances caused by +#' incomplete (3' or 5') CDS sequences. +#' +#' @param x `list` object generated by .cds_for_id2. +#' +#' +#' @return `list` of `GRangesList`s with one list element per input protein, +#' and the `GRangesList` containing the CDS of all matching (or one not +#' matching) transcripts. +#' +#' @author Johannes Rainer, Boyu Yu +#' +#' @md +#' +#' @noRd +setMethod(".cds_matching_protein", "list", + function(x) { + mapply(x, names(x), FUN = function(z, nm) { + if (!is.null(z)) { + cds_lens <- sum(width(z)) + exp_cds_len <- unlist(lapply(z, function(y) nchar(unique(y$protein_sequence)) * 3 + 3)) + ## Calculate the expected CDS length (add +3 to add the stop codon). + diffs <- cds_lens - exp_cds_len + ## Return all for which diff is 0, or otherwise the one with the + ## smallest diff. + if (any(diffs == 0)) { + z <- lapply(z, function(grng) { + mcols(grng)$cds_ok <- TRUE + grng + }) + GRangesList(z[diffs == 0]) + } else { + warning("Could not find a CDS whith the expected length for ", + "protein: '", nm, "'. The returned genomic ", + "coordinates might thus not be correct for this ", + "protein.", call. = FALSE) + ## Alternatively we could align the RNA and AA sequences and + ## trim the protein sequence... + z <- lapply(z, function(grng) { + mcols(grng)$cds_ok <- FALSE + grng + }) + GRangesList(z[which.min(abs(diffs))]) + } + } + }) +}) #' @description #' diff --git a/R/transcriptToX.R b/R/transcriptToX.R index 3409dee..8de57c1 100644 --- a/R/transcriptToX.R +++ b/R/transcriptToX.R @@ -30,6 +30,13 @@ #' #' @param id `character(1)` specifying where the transcript identifier can be #' found. Has to be either `"name"` or one of `colnames(mcols(prng))`. +#' +#' @param proteins `DFrame` object generated by [proteins()]. +#' +#' @param exons `CompressedGRangesList` object generated by [exonsBy()] where +#' by = 'tx'. +#' +#' @param transcripts `GRanges` object generated by [transcripts()]. #' #' @return #' @@ -85,12 +92,48 @@ #' #' ## Or because the provided coordinates are not within the CDS #' transcriptToProtein(IRanges(1, 1, names = "ENST00000381578"), edbx) -transcriptToProtein <- function(x, db, id = "name") { +#' +#' ## Meanwhile, this function can be called in parallel processes if you preload +#' ## the protein, exons and transcripts database. +#' +#' proteins <- proteins(edbx) +#' exons <- exonsBy(edbx) +#' transcripts <- transcripts(edbx) +#' +#' txpos <- IRanges(start = c(692, 693, 694, 695), +#' width = rep(1, 4), +#' names = c(rep("ENST00000381578", 2), rep("ENST00000486554", 2)), +#' info='test') +#' +#' transcriptToProtein(txpos,edbx,proteins = proteins,exons = exons,transcripts = transcripts) +transcriptToProtein <- function(x, db, id = "name", proteins = NA,exons = NA, transcripts = NA){ if (missing(x) || !is(x, "IRanges")) stop("Argument 'x' is required and has to be an 'IRanges' object") if (missing(db) || !is(db, "EnsDb")) stop("Argument 'db' is required and has to be an 'EnsDb' object") - .txs_to_proteins(x, db = db, id = id) + if (is(id, "DFrame") & any(any(is.na(transcripts)))) + stop("Your input parameters are not matching; please assign them accordingly.") + preload_ranges_missing <- which(c( + identical(proteins,NA), + identical(exons,NA), + identical(transcripts,NA) + )) + if (identical(integer(0), preload_ranges_missing)){ + if (!is(proteins, "DFrame")) + stop("Argument 'proteins' has to be an 'DFrame' object") + if (!"tx_id" %in% colnames(proteins)) + stop("Argument 'proteins' has to have 'tx_id'") + .txs_to_proteins(x, db = db, id = id, proteins, exons, transcripts) + } else if (length(preload_ranges_missing) == 3){ + .txs_to_proteins(x, db = db, id = id) + } else { + stop(paste( + "Argument", + c("'proteins'", "'exons'", "'transcripts'")[preload_ranges_missing], + 'missing.' + , sep = " " + )) + } } #' This version performs the mapping for each IRange separately! Results are @@ -173,7 +216,9 @@ transcriptToProtein <- function(x, db, id = "name") { #' @md #' #' @noRd -.txs_to_proteins <- function(x, db, id = "name") { +.txs_to_proteins <- function(x, db, id = "name", + proteins = NA, + exons = NA, transcripts = NA) { ## Fetch all in one. id <- match.arg(id, c("name", colnames(mcols(x)))) if (id == "name") @@ -185,11 +230,16 @@ transcriptToProtein <- function(x, db, id = "name") { ## Define internal IDs - we'll need them to return the result in the ## correct order. internal_ids <- paste0(names(x), ":", start(x), ":", end(x)) + ## Preserve the metadata if any + if(!is.null(mcols(x))) + raw_x_metadata <- mcols(x) tx_lens <- .transcriptLengths(db, filter = TxIdFilter(unique(ids)), with.cds_len = TRUE, with.utr5_len = TRUE, - with.utr3_len = TRUE) + with.utr3_len = TRUE, + exons = exons, ## Pass the preloaded + transcripts = transcripts) ## Process transcripts not found not_found <- !(ids %in% tx_lens$tx_id) fail_res <- IRanges() @@ -265,8 +315,18 @@ transcriptToProtein <- function(x, db, id = "name") { x <- x[!outside_cds] } ## Now check if the CDS length matches the protein sequence length. - prt <- proteins(db, filter = TxIdFilter(names(x)), - columns = "protein_sequence") + ## check if preloaded data is available. + if(!identical(proteins,NA)){ + tryCatch({ + prt <- proteins[proteins$tx_id %in% unique(names(x)),] + }, error = function(e){ + prt <<- DataFrame() + }) + } else { + prt <- proteins(db, filter = TxIdFilter(names(x)), + columns = "protein_sequence") + } + id_idx_prt <- match(names(x), prt$tx_id) id_idx <- match(names(x), tx_lens$tx_id) res <- IRanges(start = ceiling((start(x) - tx_lens$utr5_len[id_idx]) / 3), @@ -283,9 +343,13 @@ transcriptToProtein <- function(x, db, id = "name") { mcols(res) <- DataFrame(tx_id = names(x), tx_start = start(x), tx_end = end(x), cds_ok = cds_ok) res <- c(fail_res, res) - res[match(internal_ids, paste0(mcols(res)$tx_id, ":", + res <- res[match(internal_ids, paste0(mcols(res)$tx_id, ":", mcols(res)$tx_start, ":", mcols(res)$tx_end))] + ## Add the preserved metadata is any + if(!is.null(mcols(x))) + mcols(res) <- DataFrame(mcols(res), raw_x_metadata) + res } #' @title Map transcript-relative coordinates to genomic coordinates @@ -297,8 +361,17 @@ transcriptToProtein <- function(x, db, id = "name") { #' nucleotide of the **transcript**, not the **CDS**. CDS-relative coordinates #' have to be converted to transcript-relative positions first with the #' [cdsToTranscript()] function. +#' +#' @param x `IRanges` with the coordinates within the transcript. Coordinates +#' are counted from the start of the transcript (including the 5' UTR). The +#' Ensembl IDs of the corresponding transcripts have to be provided either +#' as `names` of the `IRanges`, or in one of its metadata columns. #' -#' @inheritParams transcriptToProtein +#' @param db `EnsDb` object or pre-loaded exons 'CompressedGRangesList' object +#' using exonsBy(). +#' +#' @param id `character(1)` specifying where the transcript identifier can be +#' found. Has to be either `"name"` or one of `colnames(mcols(prng))`. #' #' @return #' @@ -357,11 +430,39 @@ transcriptToProtein <- function(x, db, id = "name") { #' res[[3]] #' #' res +#' ## If you are tring to map a huge list of transcript-relative coordinates +#' ## to genomic level, you shall use pre-loaded exons GRangesList to replace +#' ## the SQLite db edbx +#' +#' exons <- exonsBy(EnsDb.Hsapiens.v86) +#' +#' ## Below is just a lazy demo of querying 10^4 transcript-relative +#' ## coordinates without any pre-splitting +#' library(parallel) +#' +#' txpos <- IRanges( +#' start = rep(1,10), +#' end = rep(30,10), +#' names = c(rep('ENST00000486554',9),'some'), +#' note = rep('something',10)) +#' +#' ## only run in Linux ## +#' # res_temp <- mclapply(1:10, function(ind){ +#' # transcriptToGenome(txpos[ind], exons) +#' # }, mc.preschedule = TRUE, mc.cores = detectCores() - 1) +#' +#' # res <- do.call(c,res_temp) +#' cl <- makeCluster(detectCores() - 1) +#' clusterExport(cl,c('transcriptToGenome','txpos','exons')) +#' res <- parLapply(cl,1:10,function(ind){ +#' transcriptToGenome(txpos[ind], exons) +#' }) +#' stopCluster(cl) transcriptToGenome <- function(x, db, id = "name") { if (missing(x) || !is(x, "IRanges")) stop("Argument 'x' is required and has to be an 'IRanges' object") - if (missing(db) || !is(db, "EnsDb")) - stop("Argument 'db' is required and has to be an 'EnsDb' object") + if (missing(db) || !(is(db, "EnsDb") || is(db,"CompressedGRangesList"))) + stop("Argument 'db' is required and has to be an 'EnsDb' object or a 'CompressedGRangesList' object") # load the exons priorly to allow spontaneous query since RSQLite does not support res <- .tx_to_genome(x, db, id = id) not_found <- sum(lengths(res) == 0) if (not_found > 0) @@ -379,14 +480,24 @@ transcriptToGenome <- function(x, db, id = "name") { if (any(is.null(ids))) stop("One or more of the provided IDs are NULL", call. = FALSE) names(x) <- ids - exns <- exonsBy(db, filter = TxIdFilter(unique(ids))) + if(is(db, "EnsDb")) { + exns <- exonsBy(db, filter = TxIdFilter(unique(ids))) + } else { + tryCatch({ + exns <- db[names(db) %in% unique(ids)] + }, error = function(e){ + exns <<- GRangesList() + }) + + } ## Loop over x res <- lapply(split(x, f = 1:length(x)), function(z) { if (any(names(exns) == names(z))) { gen_map <- .to_genome(exns[[names(z)]], z) if (length(gen_map)) { mcols(gen_map) <- DataFrame(mcols(gen_map), tx_start = start(z), - tx_end = end(z)) + tx_end = end(z)) # preserve the metadata from the targeted tx + if(!is.null(mcols(z))) mcols(gen_map) <- DataFrame(mcols(gen_map), mcols(z)) gen_map[order(gen_map$exon_rank)] } else GRanges() } else GRanges() @@ -441,7 +552,16 @@ transcriptToGenome <- function(x, db, id = "name") { #' ## ENST00000590515 does not encode a protein and thus -1 is returned #' ## The coordinates within ENST00000333681 are outside the CDS and thus also #' ## -1 is reported. -transcriptToCds <- function(x, db, id = "name") { +#' +#' ## Meanwhile, this function can be called in parallel processes if you preload +#' ## the exons and transcripts database. +#' +#' exons <- exonsBy(EnsDb.Hsapiens.v86) +#' transcripts <- transcripts(EnsDb.Hsapiens.v86) +#' +#' transcriptToCds(txcoords, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts) +transcriptToCds <- function(x, db, id = "name", + exons = NA, transcripts = NA) { if (missing(x) || !is(x, "IRanges")) stop("Argument 'x' is required and has to be an 'IRanges' object") if (missing(db) || !is(db, "EnsDb")) @@ -452,7 +572,9 @@ transcriptToCds <- function(x, db, id = "name") { else ids <- mcols(x)[, id] tx_lens <- .transcriptLengths(db, with.utr5_len = TRUE, with.cds_len = TRUE, - filter = ~ tx_id %in% ids) + filter = ~ tx_id %in% ids, + exons = exons, ## Pass the preloaded + transcripts = transcripts) mcols(x)$tx_start <- start(x) mcols(x)$tx_end <- end(x) start(x) <- -1 @@ -544,7 +666,16 @@ transcriptToCds <- function(x, db, id = "name") { #' ## coordinates #' transcriptToGenome(cdsToTranscript(pkp2, EnsDb.Hsapiens.v86), #' EnsDb.Hsapiens.v86) -cdsToTranscript <- function(x, db, id = "name") { +#' +#' ## Meanwhile, this function can be called in parallel processes if you preload +#' ## the exons and transcripts database. +#' +#' exons <- exonsBy(EnsDb.Hsapiens.v86) +#' transcripts <- transcripts(EnsDb.Hsapiens.v86) +#' +#' cdsToTranscript(txcoords, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts) +cdsToTranscript <- function(x, db, id = "name", + exons = NA, transcripts = NA) { if (missing(x) || !is(x, "IRanges")) stop("Argument 'x' is required and has to be an 'IRanges' object") if (missing(db) || !is(db, "EnsDb")) @@ -555,7 +686,9 @@ cdsToTranscript <- function(x, db, id = "name") { else ids <- mcols(x)[, id] tx_lens <- .transcriptLengths(db, with.utr5_len = TRUE, with.cds_len = TRUE, - filter = ~ tx_id %in% ids) + filter = ~ tx_id %in% ids, + exons = exons, ## Pass the preloaded + transcripts = transcripts) mcols(x)$cds_start <- start(x) mcols(x)$cds_end <- end(x) start(x) <- -1 diff --git a/man/cdsToTranscript.Rd b/man/cdsToTranscript.Rd index 57d6647..0ba9a0d 100644 --- a/man/cdsToTranscript.Rd +++ b/man/cdsToTranscript.Rd @@ -5,7 +5,7 @@ \title{Map positions within the CDS to coordinates relative to the start of the transcript} \usage{ -cdsToTranscript(x, db, id = "name") +cdsToTranscript(x, db, id = "name", exons = NA, transcripts = NA) } \arguments{ \item{x}{\code{IRanges} with the coordinates within the CDS. Coordinates @@ -18,6 +18,11 @@ in one of its metadata columns.} \item{id}{\code{character(1)} specifying where the transcript identifier can be found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} + +\item{exons}{\code{CompressedGRangesList} object generated by \code{\link[=exonsBy]{exonsBy()}} where +by = 'tx'.} + +\item{transcripts}{\code{GRanges} object generated by \code{\link[=transcripts]{transcripts()}}.} } \value{ \code{IRanges} with the same length (and order) than the input \code{IRanges} @@ -53,6 +58,14 @@ pkp2 <- IRanges(start = 1643, width = 1, name = "ENST00000070846") ## coordinates transcriptToGenome(cdsToTranscript(pkp2, EnsDb.Hsapiens.v86), EnsDb.Hsapiens.v86) + +## Meanwhile, this function can be called in parallel processes if you preload +## the exons and transcripts database. + +exons <- exonsBy(EnsDb.Hsapiens.v86) +transcripts <- transcripts(EnsDb.Hsapiens.v86) + +cdsToTranscript(txcoords, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts) } \seealso{ Other coordinate mapping functions: diff --git a/man/genomeToProtein.Rd b/man/genomeToProtein.Rd index 9bf80c0..ebf9b3a 100644 --- a/man/genomeToProtein.Rd +++ b/man/genomeToProtein.Rd @@ -4,13 +4,20 @@ \alias{genomeToProtein} \title{Map genomic coordinates to protein coordinates} \usage{ -genomeToProtein(x, db) +genomeToProtein(x, db, proteins = NA, exons = NA, transcripts = NA) } \arguments{ \item{x}{\code{GRanges} with the genomic coordinates that should be mapped to within-protein coordinates.} \item{db}{\code{EnsDb} object.} + +\item{proteins}{\code{DFrame} object generated by \code{\link[=proteins]{proteins()}}.} + +\item{exons}{\code{CompressedGRangesList} object generated by \code{\link[=exonsBy]{exonsBy()}} where +by = 'tx'.} + +\item{transcripts}{\code{GRanges} object generated by \code{\link[=transcripts]{transcripts()}}.} } \value{ An \code{IRangesList} with each element representing the mapping of one of the @@ -79,6 +86,15 @@ res[[3]] ## Mapping of intronic positions fail res[[4]] + +## Meanwhile, this function can be called in parallel processes if you preload +## the protein, exons and transcripts database. + +proteins <- proteins(edbx) +exons <- exonsBy(edbx) +transcripts <- transcripts(edbx) + +genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts) } \seealso{ Other coordinate mapping functions: diff --git a/man/genomeToTranscript.Rd b/man/genomeToTranscript.Rd index 70ef337..22b01e0 100644 --- a/man/genomeToTranscript.Rd +++ b/man/genomeToTranscript.Rd @@ -10,7 +10,8 @@ genomeToTranscript(x, db) \item{x}{\code{GRanges} object with the genomic coordinates that should be mapped.} -\item{db}{\code{EnsDb} object.} +\item{db}{\code{EnsDb} object or pre-loaded exons 'CompressedGRangesList' object +using exonsBy().} } \value{ An \code{IRangesList} with length equal to \code{length(x)}. Each element providing @@ -70,6 +71,43 @@ gnm <- GRanges("X", IRanges(start = c(644635, 107716399, 107716399), ## The result of the mapping is an IRangesList each element providing the ## within-transcript coordinates for each input region genomeToTranscript(gnm, edbx) + +## If you are tring to calculate within-transcript coordinates of a huge +## list of genomic region, you shall use pre-loaded exons GRangesList to +## replace the SQLite db edbx + +## Below is just a lazy demo of querying multiple genomic region +library(parallel) + +gnm <- rep(GRanges("X:107715899-107715901"),10) + +exons <- exonsBy(EnsDb.Hsapiens.v86) + +## You can pre-define the exons region to further accelerate the code. + +exons <- exonsBy( + EnsDb.Hsapiens.v86, by = "tx", + filter = AnnotationFilterList( + SeqNameFilter(as.character(unique(seqnames(gnm)))), + GeneStartFilter(max(end(gnm)), condition = "<="), + GeneEndFilter(min(start(gnm)), condition = ">=") + ) +) + +## only run in Linux ## +# res_temp <- mclapply(1:10, function(ind){ +# genomeToTranscript(gnm[ind], exons) +# }, mc.preschedule = TRUE, mc.cores = detectCores() - 1) + +# res <- do.call(c,res_temp) + +cl <- makeCluster(detectCores() - 1) +clusterExport(cl,c('genomeToTranscript','gnm','exons')) + +res <- parLapply(cl,1:10,function(ind){ + genomeToTranscript(gnm[ind], exons) +}) +stopCluster(cl) } \seealso{ Other coordinate mapping functions: diff --git a/man/proteinToGenome.Rd b/man/proteinToGenome.Rd index 0ee8ac9..e2d19c3 100644 --- a/man/proteinToGenome.Rd +++ b/man/proteinToGenome.Rd @@ -3,17 +3,24 @@ \name{proteinToGenome} \alias{proteinToGenome} \alias{proteinToGenome,EnsDb-method} +\alias{proteinToGenome,CompressedGRangesList-method} +\alias{proteinToGenome,Preloaded-method} \title{Map within-protein coordinates to genomic coordinates} \usage{ \S4method{proteinToGenome}{EnsDb}(x, db, id = "name", idType = "protein_id") + +\S4method{proteinToGenome}{CompressedGRangesList}(x, db, id = "name", idType = "protein_id") } \arguments{ \item{x}{\code{IRanges} with the coordinates within the protein(s). The object has also to provide some means to identify the protein (see details).} -\item{db}{\code{EnsDb} object to be used to retrieve genomic coordinates of -encoding transcripts.} +\item{db}{For the method for \code{EnsDb} objects: An \code{EnsDb} object to be used to +retrieve genomic coordinates of encoding transcripts.\if{html}{\out{
}}\if{html}{\out{
}} +For the method for \code{CompressedGRangesList} objects: A \code{CompressedGRangesList} object +generated by \code{\link[=cdsBy]{cdsBy()}} where by = 'tx' and columns = c('tx_id' +,'protein_id','uniprot_id','protein_sequence').} \item{id}{\code{character(1)} specifying where the protein identifier can be found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} @@ -131,6 +138,28 @@ res[[1]] ## The coordinates within the second protein span two exons res[[2]] + +## Meanwhile, this function can be called in parallel processes if you preload +## the CDS data with desired data columns +cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence')) +# cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence')) +# cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence')) +## Define an IRange with protein-relative coordinates within a protein for +## the gene SYP +syp <- IRanges(start = 4, end = 17) +names(syp) <- "ENSP00000418169" +res <- proteinToGenome(syp, cds) +res +## Positions 4 to 17 within the protein span two exons of the encoding +## transcript. + +## Perform the mapping for multiple proteins identified by their Uniprot +## IDs. +ids <- c("O15266", "Q9HBJ8", "unexistant") +prngs <- IRanges(start = c(13, 43, 100), end = c(21, 80, 100)) +names(prngs) <- ids + +res <- proteinToGenome(prngs, cds, idType = "uniprot_id") } \seealso{ \code{\link[GenomicFeatures]{proteinToGenome}} in the diff --git a/man/proteinToTranscript.Rd b/man/proteinToTranscript.Rd index 2f95e67..03e0827 100644 --- a/man/proteinToTranscript.Rd +++ b/man/proteinToTranscript.Rd @@ -2,23 +2,36 @@ % Please edit documentation in R/proteinToX.R \name{proteinToTranscript} \alias{proteinToTranscript} +\alias{proteinToTranscript,EnsDb-method} +\alias{proteinToTranscript,CompressedGRangesList-method} +\alias{proteinToTranscript,Preloaded-method} \title{Map protein-relative coordinates to positions within the transcript} \usage{ -proteinToTranscript(x, db, id = "name", idType = "protein_id") +proteinToTranscript(x, db, ...) + +\S4method{proteinToTranscript}{CompressedGRangesList}(x, db, id = "name", idType = "protein_id", fiveUTR) } \arguments{ \item{x}{\code{IRanges} with the coordinates within the protein(s). The object has also to provide some means to identify the protein (see details).} -\item{db}{\code{EnsDb} object to be used to retrieve genomic coordinates of -encoding transcripts.} +\item{db}{For the method for \code{EnsDb} objects: An \code{EnsDb} object to be used to +retrieve genomic coordinates of encoding transcripts.\if{html}{\out{
}}\if{html}{\out{
}} +For the method for \code{CompressedGRangesList} objects: A \code{CompressedGRangesList} object +generated by \code{\link[=cdsBy]{cdsBy()}} where by = 'tx' and columns = c('tx_id' +,'protein_id','uniprot_id','protein_sequence').} + +\item{...}{Further arguments to be passed on.} \item{id}{\code{character(1)} specifying where the protein identifier can be found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} \item{idType}{\code{character(1)} defining what type of IDs are provided. Has to be one of \code{"protein_id"} (default), \code{"uniprot_id"} or \code{"tx_id"}.} + +\item{fiveUTR}{A \code{CompressedGRangesList} object generated by +\code{\link[=fiveUTRsByTranscript]{fiveUTRsByTranscript()}}.} } \value{ \code{IRangesList}, each element being the mapping results for one of the input @@ -108,8 +121,43 @@ res[[1]] ## The result for the region within the second protein res[[2]] + +## Meanwhile, this function can be called in parallel processes if you preload +## the CDS data with desired data columns and fiveUTR data + +cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence')) +# cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence')) +# cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence')) + +fiveUTR <- fiveUTRsByTranscript(edbx) + +## Define an IRange with protein-relative coordinates within a protein for +## the gene SYP +syp <- IRanges(start = 4, end = 17) +names(syp) <- "ENSP00000418169" +res <- proteinToTranscript(syp, cds, fiveUTR = fiveUTR) +res +## Positions 4 to 17 within the protein span are encoded by the region +## from nt 23 to 64. + +## Perform the mapping for multiple proteins identified by their Uniprot +## IDs. +ids <- c("O15266", "Q9HBJ8", "unexistant") +prngs <- IRanges(start = c(13, 43, 100), end = c(21, 80, 100)) +names(prngs) <- ids + +res <- proteinToTranscript(prngs, cds, idType = "uniprot_id", fiveUTR = fiveUTR) } \seealso{ +Other coordinate mapping functions: +\code{\link{cdsToTranscript}()}, +\code{\link{genomeToProtein}()}, +\code{\link{genomeToTranscript}()}, +\code{\link{proteinToGenome}()}, +\code{\link{transcriptToCds}()}, +\code{\link{transcriptToGenome}()}, +\code{\link{transcriptToProtein}()} + Other coordinate mapping functions: \code{\link{cdsToTranscript}()}, \code{\link{genomeToProtein}()}, diff --git a/man/transcriptToCds.Rd b/man/transcriptToCds.Rd index f6ef043..a3d0b98 100644 --- a/man/transcriptToCds.Rd +++ b/man/transcriptToCds.Rd @@ -4,7 +4,7 @@ \alias{transcriptToCds} \title{Map transcript-relative coordinates to positions within the CDS} \usage{ -transcriptToCds(x, db, id = "name") +transcriptToCds(x, db, id = "name", exons = NA, transcripts = NA) } \arguments{ \item{x}{\code{IRanges} with the coordinates within the transcript. Coordinates @@ -17,6 +17,11 @@ in one of its metadata columns.} \item{id}{\code{character(1)} specifying where the transcript identifier can be found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} + +\item{exons}{\code{CompressedGRangesList} object generated by \code{\link[=exonsBy]{exonsBy()}} where +by = 'tx'.} + +\item{transcripts}{\code{GRanges} object generated by \code{\link[=transcripts]{transcripts()}}.} } \value{ \code{IRanges} with the same length (and order) than the input \code{IRanges} @@ -46,6 +51,14 @@ transcriptToCds(txcoords, EnsDb.Hsapiens.v86) ## ENST00000590515 does not encode a protein and thus -1 is returned ## The coordinates within ENST00000333681 are outside the CDS and thus also ## -1 is reported. + +## Meanwhile, this function can be called in parallel processes if you preload +## the exons and transcripts database. + +exons <- exonsBy(EnsDb.Hsapiens.v86) +transcripts <- transcripts(EnsDb.Hsapiens.v86) + +transcriptToCds(txcoords, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts) } \seealso{ Other coordinate mapping functions: diff --git a/man/transcriptToGenome.Rd b/man/transcriptToGenome.Rd index 0023482..d1a4808 100644 --- a/man/transcriptToGenome.Rd +++ b/man/transcriptToGenome.Rd @@ -12,7 +12,8 @@ are counted from the start of the transcript (including the 5' UTR). The Ensembl IDs of the corresponding transcripts have to be provided either as \code{names} of the \code{IRanges}, or in one of its metadata columns.} -\item{db}{\code{EnsDb} object.} +\item{db}{\code{EnsDb} object or pre-loaded exons 'CompressedGRangesList' object +using exonsBy().} \item{id}{\code{character(1)} specifying where the transcript identifier can be found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} @@ -71,6 +72,34 @@ length(res) res[[3]] res +## If you are tring to map a huge list of transcript-relative coordinates +## to genomic level, you shall use pre-loaded exons GRangesList to replace +## the SQLite db edbx + +exons <- exonsBy(EnsDb.Hsapiens.v86) + +## Below is just a lazy demo of querying 10^4 transcript-relative +## coordinates without any pre-splitting +library(parallel) + +txpos <- IRanges( + start = rep(1,10), + end = rep(30,10), + names = c(rep('ENST00000486554',9),'some'), + note = rep('something',10)) + +## only run in Linux ## +# res_temp <- mclapply(1:10, function(ind){ +# transcriptToGenome(txpos[ind], exons) +# }, mc.preschedule = TRUE, mc.cores = detectCores() - 1) + +# res <- do.call(c,res_temp) +cl <- makeCluster(detectCores() - 1) +clusterExport(cl,c('transcriptToGenome','txpos','exons')) +res <- parLapply(cl,1:10,function(ind){ + transcriptToGenome(txpos[ind], exons) +}) +stopCluster(cl) } \seealso{ \code{\link[=cdsToTranscript]{cdsToTranscript()}} and \code{\link[=transcriptToCds]{transcriptToCds()}} for the mapping between diff --git a/man/transcriptToProtein.Rd b/man/transcriptToProtein.Rd index b1e2c1e..06b02d0 100644 --- a/man/transcriptToProtein.Rd +++ b/man/transcriptToProtein.Rd @@ -5,7 +5,14 @@ \title{Map transcript-relative coordinates to amino acid residues of the encoded protein} \usage{ -transcriptToProtein(x, db, id = "name") +transcriptToProtein( + x, + db, + id = "name", + proteins = NA, + exons = NA, + transcripts = NA +) } \arguments{ \item{x}{\code{IRanges} with the coordinates within the transcript. Coordinates @@ -17,6 +24,13 @@ as \code{names} of the \code{IRanges}, or in one of its metadata columns.} \item{id}{\code{character(1)} specifying where the transcript identifier can be found. Has to be either \code{"name"} or one of \code{colnames(mcols(prng))}.} + +\item{proteins}{\code{DFrame} object generated by \code{\link[=proteins]{proteins()}}.} + +\item{exons}{\code{CompressedGRangesList} object generated by \code{\link[=exonsBy]{exonsBy()}} where +by = 'tx'.} + +\item{transcripts}{\code{GRanges} object generated by \code{\link[=transcripts]{transcripts()}}.} } \value{ \code{IRanges} with the same length (and order) than the input \code{IRanges} @@ -78,6 +92,20 @@ transcriptToProtein(IRanges(1, 1, names = "unknown"), edbx) ## Or because the provided coordinates are not within the CDS transcriptToProtein(IRanges(1, 1, names = "ENST00000381578"), edbx) + +## Meanwhile, this function can be called in parallel processes if you preload +## the protein, exons and transcripts database. + +proteins <- proteins(edbx) +exons <- exonsBy(edbx) +transcripts <- transcripts(edbx) + +txpos <- IRanges(start = c(692, 693, 694, 695), + width = rep(1, 4), + names = c(rep("ENST00000381578", 2), rep("ENST00000486554", 2)), + info='test') + +transcriptToProtein(txpos,edbx,proteins = proteins,exons = exons,transcripts = transcripts) } \seealso{ \code{\link[=cdsToTranscript]{cdsToTranscript()}} and \code{\link[=transcriptToCds]{transcriptToCds()}} for conversion between diff --git a/tests/testthat/test_Methods.R b/tests/testthat/test_Methods.R index 0e8c66b..741df68 100644 --- a/tests/testthat/test_Methods.R +++ b/tests/testthat/test_Methods.R @@ -441,17 +441,17 @@ test_that("lengthOf works", { expect_identical(lenY, lenY2) ## Just using the transcriptLengths - res <- ensembldb:::.transcriptLengths(edb, filter = GenenameFilter("ZBTB16")) - res_2 <- lengthOf(edb, "tx", filter = GenenameFilter("ZBTB16")) + res <- ensembldb:::.transcriptLengths(edb, filter = GeneNameFilter("ZBTB16")) + res_2 <- lengthOf(edb, "tx", filter = GeneNameFilter("ZBTB16")) ## 'GenenameFilter' is deprecated. expect_identical(sort(res$tx_len), unname(sort(res_2))) ## also cds lengths etc. - res <- ensembldb:::.transcriptLengths(edb, filter = GenenameFilter("ZBTB16"), + res <- ensembldb:::.transcriptLengths(edb, filter = GeneNameFilter("ZBTB16"), with.cds_len = TRUE, with.utr5_len = TRUE, with.utr3_len = TRUE) expect_identical(colnames(res), c("tx_id", "gene_id", "nexon", "tx_len", "cds_len", "utr5_len", "utr3_len")) - tx <- transcripts(edb, filter = list(GenenameFilter("ZBTB16"), + tx <- transcripts(edb, filter = list(GeneNameFilter("ZBTB16"), TxBiotypeFilter("protein_coding"))) expect_true(all(!is.na(res[names(tx), "cds_len"]))) expect_equal(unname(res[names(tx), "tx_len"]), @@ -471,6 +471,27 @@ test_that("lengthOf works", { expect_equal(res[names(fives), "utr5_len"], unname(sum(width(fives)))) expect_equal(res[names(threes), "utr3_len"], unname(sum(width(threes)))) expect_equal(res[names(cdss), "cds_len"], unname(sum(width(cdss)))) + + ## Preloaded data test for tx_id only filter + exons <- exonsBy(edbx) + transcripts <- transcripts(edbx) + res_1 <- ensembldb:::.transcriptLengths(edbx, with.cds_len = TRUE, + with.utr5_len = TRUE, + with.utr3_len = TRUE, + exons = exons, + transcripts = transcripts) + expect_error(ensembldb:::.transcriptLengths(edbx, with.cds_len = TRUE, + with.utr5_len = TRUE, + with.utr3_len = TRUE, + exons = exons, + transcripts = exons)) + + expect_error(ensembldb:::.transcriptLengths(edbx, with.cds_len = TRUE, + with.utr5_len = TRUE, + with.utr3_len = TRUE, + exons = exons, + transcripts = exons)) + expect_equal(res,res_1) }) diff --git a/tests/testthat/test_genomeToX.R b/tests/testthat/test_genomeToX.R index 1f52df5..27a893a 100644 --- a/tests/testthat/test_genomeToX.R +++ b/tests/testthat/test_genomeToX.R @@ -12,23 +12,26 @@ test_that(".genome_to_tx and genomeToTranscript works", { ## 1) ENST00000381578, + strand last four nt of CDS. ## Position in ENST00000381578: 259 + 709 + 209 + 58 + 89 + 245 = 1569 ## gnm <- GRanges("X", IRanges(start = 605370, end = 605374)) - gnm <- GRanges("X", IRanges(start = 644635, end = 644639)) + gnm <- GRanges("X", IRanges(start = 644635, end = 644639), info = "Others") res <- ensembldb:::.genome_to_tx(gnm, db = edbx) res_lst <- c(res_lst, list(res)) expect_true(length(res) == 2) expect_true(is(res, "IRanges")) expect_equal(start(res["ENST00000381578"]), 1569) expect_equal(width(res["ENST00000381578"]), 5) + expect_equal(res@elementMetadata$info,c("Others","Others")) ## Restrict to negative strand - nothing there. strand(gnm) <- "-" res <- ensembldb:::.genome_to_tx(gnm, db = edbx) expect_equal(length(res), 1) expect_equal(start(res), -1) + expect_equal(res@elementMetadata$info,c("Others")) expect_true(is(res, "IRanges")) ## 2) ENST00000486554, - strand, nt 1-3 of tx ## gnm <- GRanges("X:106959629-106959631") gnm <- GRanges("X:107716399-107716401") + gnm$info <- "Others" res <- ensembldb:::.genome_to_tx(gnm, db = edbx) res_lst <- c(res_lst, list(res)) expect_equal(names(res), c("ENST00000372390", "ENST00000486554")) @@ -38,6 +41,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { ## 3) ENST00000486554, - strand, nt 1-3 of CDS, nt 501-503 of tx ## gnm <- GRanges("X:106959129-106959131") gnm <- GRanges("X:107715899-107715901") + gnm$info <- "Others" res <- ensembldb:::.genome_to_tx(gnm, db = edbx) res_lst <- c(res_lst, list(res)) expect_equal(length(res), 11) @@ -47,6 +51,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { ## 4) As above, just not all within the exon. ## gnm <- GRanges("X:106959629-106959635") gnm <- GRanges("X:107716399-107716405") + gnm$info <- "Others" res <- ensembldb:::.genome_to_tx(gnm, db = edbx) res_lst <- c(res_lst, list(res)) expect_true(length(res) == 1) @@ -56,6 +61,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { ## exon 1: 503nt: region is expected at 508-511. ## gnm <- GRanges("X", IRanges(end = 106957975, width = 4)) gnm <- GRanges("X", IRanges(end = 107714745, width = 4)) + gnm$info <- "Others" res <- ensembldb:::.genome_to_tx(gnm, db = edbx) res_lst <- c(res_lst, list(res)) expect_true(length(res) == 9) # overlapping many tx... @@ -66,6 +72,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { ## exon 3: 446 + 52 + 1529 total length: 2025 ## gnm <- GRanges("X", IRanges(start = 106956451, width = 3)) gnm <- GRanges("X", IRanges(start = 107713221, width = 3)) + gnm$info <- "Others" res <- ensembldb:::.genome_to_tx(gnm, db = edbx) res_lst <- c(res_lst, list(res)) expect_true(length(res) == 3) @@ -75,6 +82,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { ## 7) A region with a tx/exon on both strands. ## gnm <- GRanges("7", IRanges(start = 127231550, width = 5)) gnm <- GRanges("7", IRanges(start = 127591496, width = 5)) + gnm$info <- "Others" edb7 <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "7") res <- ensembldb:::.genome_to_tx(gnm, edb7) res_lst <- c(res_lst, list(res)) @@ -89,7 +97,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { gnm <- GRanges("X", IRanges(start = c(644635, 107716399), end = c(644639, 107716401) )) names(gnm) <- c("a", "b") - res <- .genome_to_tx(gnm, edbx) + res <- ensembldb:::.genome_to_tx(gnm, edbx) expect_true(is(res, "IRangesList")) expect_true(length(res) == 2) expect_equal(start(res[[1]]["ENST00000381578"]), 1569) @@ -105,6 +113,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { end = c(644639, 107716401, 107715901, 107716405, 107714745, 107713223, 127591500, 127591996, 127591497)), seqnames = c("X", "X", "X", "X", "X", "X", "7", "7", "Z")) + gnm$info <- "Others" res <- ensembldb:::.genome_to_tx(gnm, edb) expect_equal(res[[1]], res_lst[[1]]) expect_equal(res[[2]], res_lst[[2]]) @@ -115,7 +124,7 @@ test_that(".genome_to_tx and genomeToTranscript works", { expect_equal(res[[7]], res_lst[[7]]) expect_equal(start(res[[8]]), -1) expect_equal(start(res[[9]]), -1) - + expect_equal(res[[8]]@elementMetadata$info,"Others") ## Check errors etc. expect_error(genomeToTranscript()) expect_error(genomeToTranscript(x = 4, db = edbx)) @@ -138,6 +147,19 @@ test_that(".genome_to_tx and genomeToTranscript works", { expect_equal(names(res), names(gnm)) expect_true(is(res, "IRangesList")) expect_true(start(res[[3]]) < 0) + + # test with pre-loaded data + exons <- exonsBy(edb) + + gnm <- GRanges( + IRanges(start = c(644635, 107716399, 107715899, 107716399, 107714742, + 107713221, 127591496, 127591496, 127591496), + end = c(644639, 107716401, 107715901, 107716405, 107714745, + 107713223, 127591500, 127591996, 127591497)), + seqnames = c("X", "X", "X", "X", "X", "X", "7", "7", "Z")) + gnm$info <- 'Others' + expect_warning(expect_true(identical(genomeToTranscript(gnm,edb), genomeToTranscript(gnm,exons)))) + }) test_that("genomeToProtein works", { @@ -230,4 +252,98 @@ test_that("genomeToProtein works", { names = c("good1","bad1"))) expect_warning(res <- genomeToProtein(grg, edb_2)) expect_equal(length(res), length(grg)) + + ## Preloaded data test + proteins <- proteins(edb) + exons <- exonsBy(edb) + transcripts <- transcripts(edb) + + gnm <- GRanges("X", IRanges(start = c(630898, 644636, 644633, 634829), + width = c(5, 1, 1, 3))) + names(gnm) <- c("a", "b", "c", "d") + expect_error(res <- genomeToProtein(gnm, edbx, proteins = proteins, exons = exons)) + expect_error(res <- genomeToProtein(gnm, edbx, exons = exons)) + expect_warning(expect_error(res <- genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = exons))) + expect_warning(res <- genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_true(is(res, "IRangesList")) + expect_equal(length(res), 4) + ## Elements 2 and 4 can not be mapped + expect_equal(start(res[[2]]), c(-1, -1)) + expect_equal(start(res[[4]]), c(-1)) + ## Check that order is correct + expect_true(mcols(res[[1]])$seq_start[1] == 630898) + expect_true(mcols(res[[2]])$seq_start[1] == 644636) + expect_true(mcols(res[[3]])$seq_start[1] == 644633) + expect_true(mcols(res[[4]])$seq_start[1] == 634829) + ## Check coords within the protein: + expect_true(start(res[[1]])[1] == 1) + expect_true(end(res[[1]])[1] == 2) + prt <- proteins(edbx, filter = ProteinIdFilter(names(res[[3]])[1])) + expect_equal(start(res[[3]])[1], nchar(prt$protein_sequence)) + + ## 2) ENST00000486554, - strand, nt 1-3 + ## gnm <- GRanges("X:106959629-106959631") + gnm <- GRanges("X:107716399-107716401") + expect_warning(res <- genomeToProtein(gnm, db = edbx, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_true(is(res, "IRangesList")) + expect_true(length(res) == length(gnm)) + + ## gnm <- GRanges("X:106959629-106959931") + gnm <- GRanges("X:107716399-107716701") + expect_warning(res <- genomeToProtein(gnm, db = edbx, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_true(is(res, "IRangesList")) + expect_true(length(res) == length(gnm)) + expect_true(start(res[[1]]) < 0) + + ## ENST00000486554 + ## 106959629:3: first 3 nt of transcript + ## 106959629:700: spans exon 1 and part of intron 1 + ## 106959130:2: first two nt of CDS. + ## 106957979:1 first nt in exon 2, second amino acid. + ## gnm <- GRanges("X", IRanges(start = c(106959629, 106959629, 106959130, + ## 106957979), + ## width = c(3, 700, 2, 1))) + gnm <- GRanges("X", IRanges(start = c(107716399, 107716399, 107715900, + 107714749), + width = c(3, 700, 2, 1))) + expect_warning(res <- genomeToProtein(gnm, db = edbx, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_true(is(res, "IRangesList")) + expect_true(all(start(res[[1]]) < 0)) + expect_true(all(start(res[[2]]) < 0)) + expect_equal(start(res[[3]]["ENSP00000425414"]), 1) + expect_equal(width(res[[3]]["ENSP00000425414"]), 1) + expect_equal(start(res[[4]]["ENSP00000425414"]), 2) + expect_equal(width(res[[4]]["ENSP00000425414"]), 1) + + ## gnm <- GRanges("X", IRanges(start = 106959130, width = 2)) + gnm <- GRanges("X", IRanges(start = 107715900, width = 2)) + expect_warning(res <- genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_true(is(res, "IRangesList")) + expect_true(length(res) == length(gnm)) + expect_equal(start(res[[1]]["ENSP00000425414"]), 1) + expect_false(mcols(res[[1]]["ENSP00000425414"])$cds_ok) + + ## That's from issue #69 + library(EnsDb.Hsapiens.v86) + library(testthat) + edb_2 <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "19") + + ## grg <- GRanges(Rle(c("chr19")), IRanges(51920103, width=1, names = "good1")) + grg <- GRanges(Rle(c("chr19")), IRanges(51416849, width=1, names = "good1")) + expect_warning(genomeToProtein(grg, db = edb_2, proteins = proteins, exons = exons, transcripts = transcripts)) + edb_2 <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "chr19") + seqlevelsStyle(edb_2) <- "UCSC" + + proteins <- proteins(edb_2) + exons <- exonsBy(edb_2) + transcripts <- transcripts(edb_2) + expect_warning(res <- genomeToProtein(grg, db = edb_2, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_true(length(res[[1]]) == 4) + + ## grg <- GRanges("chr19", IRanges(c(51920103, 51919998), width = 1, + ## names = c("good1","bad1"))) + grg <- GRanges("chr19", IRanges(c(51416849, 51416744), width = 1, + names = c("good1","bad1"))) + expect_warning(res <- genomeToProtein(grg, edb_2, proteins = proteins, exons = exons, transcripts = transcripts)) + expect_equal(length(res), length(grg)) }) diff --git a/tests/testthat/test_proteinToGenome.R b/tests/testthat/test_proteinToGenome.R index 6f75b6e..49fe9f6 100644 --- a/tests/testthat/test_proteinToGenome.R +++ b/tests/testthat/test_proteinToGenome.R @@ -76,6 +76,41 @@ test_that(".cds_for_id and .cds_matching_protein work", { expect_equal(unname(lengths(clnd)), c(1, 0, 1)) expect_true(all(clnd[[1]][[1]]$cds_ok)) expect_true(all(clnd[[3]][[1]]$cds_ok == FALSE)) + + ## Preloaded data test + cds <- cdsBy(edb,columns = c(listColumns(edb,'tx'),'protein_id','uniprot_id','protein_sequence')) + + ## Use protein_id + expect_warning(res <- ensembldb:::.cds_for_id2(cds, prts)) + expect_equal(names(res), prts) + expect_equal(names(res[[1]]), txs[1]) + expect_equal(names(res[[3]]), txs[2]) + expect_true(is.null(res[[2]])) + + ## Use tx_id + expect_warning(res <- ensembldb:::.cds_for_id2(cds,txs,idType = 'tx_id')) + expect_equal(names(res), txs) + expect_equal(res[[1]][[1]]$protein_id[1], prts[1]) + + ## Use uniprot_id + unis <- c("Q9NPH5", "Q8TD30", "H0YIP2") + unis_counts <- c(10, 2, 1) + res <- ensembldb:::.cds_for_id2(cds, unis, idType = "uniprot_id") + expect_equal(unname(lengths(res)), unis_counts) + expect_equal(names(res), unis) + + ## Run .cds_matching_protein on the results to avoid re-querying. + ## H0YIP2_HUMAN has incomplete 5' and 3' CDS. + expect_warning(clnd <- ensembldb:::.cds_matching_protein(edb,res)) + expect_warning(clnd_preload <- ensembldb:::.cds_matching_protein(res)) + expect_equal(clnd,clnd_preload) + + ## ENSP00000437716 encoding transcript has an incomplete 3' CDS + cds <- cdsBy(edb,columns = c(listColumns(edb,'tx'),'protein_id','protein_sequence')) + res_preload <- ensembldb:::.cds_for_id2(cds, prts) + expect_warning(clnd <- ensembldb:::.cds_matching_protein(edb, res_preload)) + expect_warning(clnd_preload <- ensembldb:::.cds_matching_protein(res_preload)) + expect_equal(clnd, clnd_preload) }) test_that("proteinToGenome works", { @@ -126,6 +161,23 @@ test_that("proteinToGenome works", { expect_equal(res[[4]]$exon_rank, c(2, 3, 4, 5)) expect_true(all(res[[4]]$cds_ok)) + ## Preloaded data test + cds <- cdsBy(edb,columns = c('tx_id','uniprot_id','protein_id','protein_sequence')) + expect_warning(res <- proteinToGenome(prngs, cds)) + expect_true(all(sapply(res, function(z) sum(width(z))) %% 3 == 0)) + ## Manually check for the ranges here. + ## ENSP00000418169, SYP, encoded by ENST00000479808, - strand + ## coords cds: 23 -> 67, 24 -> 72 + ## exon 1 width 49, 5' UTR: 13nt long. 36 nt + ## exon 2 width 66. relative pos is 31nt in exon 2 + ## exon 2 starts: 49,055,492 [nt1], [nt31] is 49055492 - 30 = 49055462 + expect_equal(end(res[[2]]), 49199003) + expect_equal(start(res[[2]]), 49199003 - 5) + expect_equal(res[[2]]$protein_id, "ENSP00000418169") + expect_equal(res[[2]]$tx_id, "ENST00000479808") + expect_equal(res[[2]]$exon_rank, 2) + expect_true(res[[2]]$cds_ok) + ## Uniprot identifier ## ids <- c("D6RDZ7_HUMAN", "SHOX_HUMAN", "TMM27_HUMAN", "unexistant") ids <- c("D6RDZ7", "O15266", "Q9HBJ8", "unexistant") @@ -152,6 +204,27 @@ test_that("proteinToGenome works", { expect_true(all(strts == strts[1])) nds <- unlist(end(res[[2]])) expect_true(all(nds == nds[1])) + + ## Preloaded data test + expect_warning(res <- proteinToGenome(prngs, cds,idType = "uniprot_id")) + expect_true(is(res[[1]], "GRanges")) + expect_true(is(res[[2]], "GRangesList")) + expect_true(is(res[[3]], "GRanges")) + expect_true(is(res[[4]], "GRanges")) + expect_true(length(res[[4]]) == 0) + res <- res[1:3] + expect_true(all(unlist(lapply(res, function(z) sum(width(z)))) %% 3 == 0)) + ## We've got multi-mapping here + expect_equal(unname(lengths(res)), c(1, 4, 2)) + ## Although we have different proteins, all coords are the same + strts <- unlist(start(res[[1]])) + expect_true(all(strts == strts[1])) + nds <- unlist(end(res[[1]])) + expect_true(all(nds == nds[1])) + strts <- unlist(start(res[[2]])) + expect_true(all(strts == strts[1])) + nds <- unlist(end(res[[2]])) + expect_true(all(nds == nds[1])) }) test_that("proteinToTranscript works", { @@ -181,6 +254,13 @@ test_that("proteinToTranscript works", { expect_equal(start(tx_rel[["ENSP00000217901"]]), (197L + 7L)) expect_equal(start(tx_rel[["ENSP00000425155"]]), (86L + 92L + 31L)) + ## Preloaded data test + + cds <- cdsBy(edb, columns = c('tx_id','protein_id','uniprot_id','protein_sequence')) + fiveUTR <- fiveUTRsByTranscript(edb) + expect_warning(tx_rel_preload <- proteinToTranscript(prng, cds, fiveUTR = fiveUTR)) + expect_equal(tx_rel_preload,tx_rel) + ## Now for Uniprot... ## ids <- c("D6RDZ7_HUMAN", "SHOX_HUMAN", "TMM27_HUMAN", "unexistant") ids <- c("D6RDZ7", "O15266", "Q9HBJ8", "unexistant") @@ -195,6 +275,12 @@ test_that("proteinToTranscript works", { expect_equal(unique(lens[2:5]), width(prngs)[2] * 3) expect_equal(unique(lens[6]), width(prngs)[3] * 3) + ## Preloaded data test + expect_warning(tx_rel_preload <- proteinToTranscript(prngs, cds, + idType = "uniprot_id", + fiveUTR = fiveUTR)) + expect_equal(tx_rel_preload,tx_rel) + ## Mapping fails for all... ids <- c("a", "unexistant") prngs <- IRanges(start = c(1, 13), end = c(2, 21)) @@ -203,6 +289,11 @@ test_that("proteinToTranscript works", { expect_warning(res <- proteinToTranscript(prngs, edb)) expect_true(is(res, "IRangesList")) expect_equal(unlist(unname(start(res))), c(-1, -1)) + + ## Preloaded data test + expect_warning(res_preload <- proteinToTranscript(prngs, cds, + fiveUTR = fiveUTR)) + expect_equal(res_preload,res) }) test_that(".cds_for_id_range works", { diff --git a/tests/testthat/test_transcriptToX.R b/tests/testthat/test_transcriptToX.R index cbdbbff..7bb5627 100644 --- a/tests/testthat/test_transcriptToX.R +++ b/tests/testthat/test_transcriptToX.R @@ -71,6 +71,7 @@ test_that(".tx_to_protein works", { mcols(x)$id <- "ENST00000381578" res_a <- .tx_to_protein(x, edbx, id = "id") res_b <- .txs_to_proteins(x, edbx, id = "id") + mcols(res_b) <- mcols(res_b)[1:4] expect_equal(res_a, res_b) ## Multiple input. @@ -83,6 +84,16 @@ test_that(".tx_to_protein works", { expect_warning(res_a <- .tx_to_protein(x, edbx)) expect_warning(res_b <- .txs_to_proteins(x, edbx)) expect_equal(res_a, res_b) + + ## Preloaded data test + proteins <- proteins(edbx) + exons <- exonsBy(edbx) + transcripts <- transcripts(edbx) + expect_warning(res_c <- .txs_to_proteins(x, edbx, + proteins = proteins, + exons = exons, + transcripts = transcripts)) + expect_equal(res_c, res_b) }) test_that("transcriptToProtein works", { @@ -100,6 +111,25 @@ test_that("transcriptToProtein works", { res <- transcriptToProtein(x, edbx, id = "id") expect_equal(start(res), 1) expect_equal(end(res), 1) + + ## Preloaded data test + + proteins <- proteins(edbx) + exons <- exonsBy(edbx) + transcripts <- transcripts(edbx) + expect_error(transcriptToProtein(x, edbx, id = "id", + exons = exons, + transcripts = transcripts)) + expect_error(transcriptToProtein(x, edbx, + proteins = proteins, + exons = exons, + transcripts = transcripts)) + expect_error(transcriptToProtein(x, edbx, id = "id", + proteins = proteins, + exons = proteins, + transcripts = transcripts)) + res2 <- transcriptToProtein(x, edbx, id = "id", proteins = proteins, exons = exons, transcripts = transcripts) + expect_equal(res2, res) }) test_that(".tx_to_genome works", { @@ -140,6 +170,14 @@ test_that(".tx_to_genome works", { expect_equal(as.character(strand(res[[6]])), "+") expect_equal(as.character(seqnames(res[[6]])), "Y") + ## add pre-load data test + exons <- exonsBy(edbx) + res_preload <- unlist(ensembldb:::.tx_to_genome(rng, exons)) + res_unlisted <- unlist(res) + res_unlisted$tx_id <- NULL + seqlevels(res_preload) <- seqlevels(res_unlisted) + expect_equal(res_unlisted, res_preload) + ## wrong ID and range outside tx rng <- IRanges(start = c(501, 200, 1), end = c(505, 1200, 5), names = c("ENST00000486554", "ENST00000486554", "B")) @@ -150,6 +188,13 @@ test_that(".tx_to_genome works", { expect_equal(a, b) expect_equal(lengths(res_2), c(ENST00000486554 = 2, ENST00000486554 = 0, B = 0)) + ## add pre-load data test + expect_warning(res_3 <- ensembldb:::.tx_to_genome(rng, exons)) + c <- unlist(res_3[1]) + seqlevels(res_preload) <- seqlevels(c) + expect_equal(res_preload, res_preload) + expect_equal(lengths(res_3), c(ENST00000486554 = 2, ENST00000486554 = 0, + B = 0)) }) test_that("transcriptToGenome works", { @@ -186,7 +231,40 @@ test_that("transcriptToGenome works", { x <- IRanges(start = c(256, 2), end = c(265, 12000), names = c("ENST00000381578", "ENST00000381578")) - res <- transcriptToGenome(x, edbx) + expect_warning(res <- transcriptToGenome(x, edbx)) # indeed warn expected + expect_equal(lengths(res), c(ENST00000381578 = 2, ENST00000381578 = 0)) + + ## add pre-load data test + exons <- exonsBy(edbx) + x <- IRanges(start = c(259, 1, 259), end = c(260, 4, 261), + names = c("ENST00000381578", "some", "ENST00000381578")) + expect_warning(res <- transcriptToGenome(x, exons)) + expect_true(is(res, "GRangesList")) + expect_true(length(res) == length(x)) + expect_true(length(res[[2]]) == 0) + expect_equal(names(res), names(x)) + expect_equal(start(res[[1]]), start(res[[3]])) + expect_equal(end(res[[1]])[1], end(res[[3]])[1]) + expect_equal(end(res[[1]])[2] + 1, end(res[[3]])[2]) + + expect_warning(res <- transcriptToGenome(x[2], exons)) + expect_true(is(res, "GRangesList")) + expect_true(length(res) == 1) + + expect_warning(res <- transcriptToGenome(x[c(2, 2)], exons)) + expect_true(is(res, "GRangesList")) + expect_true(length(res) == 2) + + res <- transcriptToGenome(x[1], exons) + expect_true(is(res, "GRangesList")) + expect_true(length(res) == 1) + expect_true(length(res[[1]]) == 2) + + x <- IRanges(start = c(256, 2), end = c(265, 12000), + names = c("ENST00000381578", "ENST00000381578"), + info = 'ORFs') + expect_warning(res <- transcriptToGenome(x, exons)) + expect_true(res$ENST00000381578$info[1] == 'ORFs') expect_equal(lengths(res), c(ENST00000381578 = 2, ENST00000381578 = 0)) }) @@ -230,6 +308,46 @@ test_that("transcriptToCds works", { expect_warning(res <- transcriptToCds(txcoords, edb18)) expect_true(all(start(res) < 0)) expect_true(all(end(res) < 0)) + + ## Preloaded data test + exons <- exonsBy(edb18) + transcripts <- transcripts(edb18) + expect_error(transcriptToCds(db = edb18, exons = exons,transcripts = transcripts)) + ## 1) unknown tx ids + txcoords <- IRanges(start = c(4, 3), width = c(1, 1), names = c("a", "b")) + expect_error(transcriptToCds(x = txcoords)) + expect_warning(res <- transcriptToCds(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_true(all(start(res) == -1)) + expect_true(all(end(res) == -1)) + ## 2) all tx not coding + txcoords <- IRanges(start = c(132, 133), end = c(323, 323), + names = rep("ENST00000590515", 2)) + expect_warning(res <- transcriptToCds(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_true(all(start(res) == -1)) + expect_true(all(end(res) == -1)) + ## 3) some tx not coding + ## 4) coordinate not within coding + txcoords <- IRanges(start = c(1463, 3, 143, 147, 1463), width = 1, + names = c("ENST00000398117", "ENST00000333681", + "ENST00000590515", "ENST00000589955", + "ENST00000398117")) + expect_warning(res <- transcriptToCds(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_equal(start(res), c(1, -1, -1, 1, 1)) + expect_equal(end(res), c(1, -1, -1, 1, 1)) + expect_equal(res[1], res[5]) + ## End position outside of CDS + txcoords <- IRanges(start = c(1463, 3, 143, 147), width = c(4, 1, 1, 765), + names = c("ENST00000398117", "ENST00000333681", + "ENST00000590515", "ENST00000589955")) + expect_warning(res <- transcriptToCds(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_equal(start(res), c(1, -1, -1, -1)) + expect_equal(end(res), c(4, -1, -1, -1)) + txcoords <- IRanges(start = c(3, 143, 147), width = c(1, 1, 765), + names = c("ENST00000333681", + "ENST00000590515", "ENST00000589955")) + expect_warning(res <- transcriptToCds(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_true(all(start(res) < 0)) + expect_true(all(end(res) < 0)) }) test_that("cdsToTranscript works", { @@ -266,6 +384,44 @@ test_that("cdsToTranscript works", { res <- getSeq(BSgenome.Hsapiens.NCBI.GRCh38, unlist(gnm)) exp <- c("G", "C", "C", "C", "T") expect_equal(exp, unname(as.character(res))) + + ## Preloaded data test + exons <- exonsBy(EnsDb.Hsapiens.v86) + transcripts <- transcripts(EnsDb.Hsapiens.v86) + + edb18 <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "18") + expect_error(cdsToTranscript()) + expect_error(cdsToTranscript(db = edb18, exons = exons,transcripts = transcripts)) + txcoords <- IRanges(start = c(4, 3, 143, 147), width = 1, + names = c("ENST00000398117", "ENST00000333681", + "ENST00000590515", "ENST00000589955")) + expect_error(cdsToTranscript(x = txcoords)) + expect_warning(res <- cdsToTranscript(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_equal(start(res), c(1466, 902, -1, 293)) + + txcoords <- IRanges(start = c(4, 3, 50000, 147), width = 1, + names = c("ENST00000398117", "ENST00000398117", + "ENST00000398117", "b")) + expect_warning(res <- cdsToTranscript(txcoords, edb18, exons = exons,transcripts = transcripts)) + expect_equal(start(res), c(1466, 1465, -1, -1)) + + ## Map variants: + ## ENST00000070846:c.1643DelG + ## ENST00000070846:c.1881DelC + ## ENST00000379802:c.6995C>A + ## ENST00000261590:c.1088C>T + ## ENST00000261590:c.561T>G + rngs <- IRanges(start = c(1643, 1881, 6995, 1088, 561), width = 1, + names = c("ENST00000070846", "ENST00000070846", + "ENST00000379802", "ENST00000261590", + "ENST00000261590")) + rngs_tx <- cdsToTranscript(rngs, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts) + gnm <- transcriptToGenome(rngs_tx, exons) + + library(BSgenome.Hsapiens.NCBI.GRCh38) + res <- getSeq(BSgenome.Hsapiens.NCBI.GRCh38, unlist(gnm)) + exp <- c("G", "C", "C", "C", "T") + expect_equal(exp, unname(as.character(res))) }) test_that(".ids_message works", {