From 453bc37a277cbf7776d7a75daed1c32a27255e2a Mon Sep 17 00:00:00 2001 From: HDash <16350928+HDash@users.noreply.github.com> Date: Fri, 28 Jun 2024 10:47:54 +0100 Subject: [PATCH] Add segregate_seqs and BiocParallel wrappers for parallel processing --- DESCRIPTION | 3 +- NAMESPACE | 5 ++ R/MotifPeeker.R | 2 +- R/bpapply.R | 46 +++++++++++++++ R/get_bpparam.R | 45 +++++++++++++++ R/segregate_seqs.R | 58 +++++++++++++++++++ inst/markdown/MotifPeeker.Rmd | 4 ++ man/MotifPeeker.Rd | 2 +- man/bpapply.Rd | 85 ++++++++++++++++++++++++++++ man/get_bpparam.Rd | 41 ++++++++++++++ man/segregate_seqs.Rd | 49 ++++++++++++++++ tests/testthat/test-bpapply.R | 26 +++++++++ tests/testthat/test-segregate_seqs.R | 19 +++++++ 13 files changed, 382 insertions(+), 3 deletions(-) create mode 100644 R/bpapply.R create mode 100644 R/get_bpparam.R create mode 100644 R/segregate_seqs.R create mode 100644 man/bpapply.Rd create mode 100644 man/get_bpparam.Rd create mode 100644 man/segregate_seqs.Rd create mode 100644 tests/testthat/test-bpapply.R create mode 100644 tests/testthat/test-segregate_seqs.R diff --git a/DESCRIPTION b/DESCRIPTION index 82a72de..3ddda8b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -56,7 +56,8 @@ Imports: GenomeInfoDb, Biostrings, BSgenome, - memes + memes, + S4Vectors Suggests: BiocStyle, BSgenome.Hsapiens.UCSC.hg19, diff --git a/NAMESPACE b/NAMESPACE index c480423..898ef6f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(MotifPeeker) +export(bpapply) export(calc_frip) export(check_ENCODE) export(check_JASPAR) @@ -12,6 +13,7 @@ export(read_motif_file) export(read_peak_file) export(report_command) export(report_header) +export(segregate_seqs) export(summit_to_motif) export(to_plotly) export(trim_seqs) @@ -29,6 +31,7 @@ importFrom(GenomeInfoDb,seqlengths) importFrom(GenomicAlignments,summarizeOverlaps) importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,end) +importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,mcols) importFrom(GenomicRanges,seqnames) importFrom(GenomicRanges,start) @@ -36,6 +39,8 @@ importFrom(GenomicRanges,strand) importFrom(GenomicRanges,width) importFrom(IRanges,IRanges) importFrom(Rsamtools,countBam) +importFrom(S4Vectors,queryHits) +importFrom(S4Vectors,subjectHits) importFrom(SummarizedExperiment,assay) importFrom(htmltools,tagList) importFrom(htmlwidgets,JS) diff --git a/R/MotifPeeker.R b/R/MotifPeeker.R index 697df1d..4eea867 100644 --- a/R/MotifPeeker.R +++ b/R/MotifPeeker.R @@ -176,7 +176,7 @@ MotifPeeker <- function( output_dir = tempdir(), display = NULL, use_cache = TRUE, - ncpus = 1, + ncpus = 2, quiet = TRUE, debug = FALSE, verbose = FALSE diff --git a/R/bpapply.R b/R/bpapply.R new file mode 100644 index 0000000..937291e --- /dev/null +++ b/R/bpapply.R @@ -0,0 +1,46 @@ +#' Use BiocParallel functions with appropriate parameters +#' +#' Light wrapper around \code{\link[BiocParallel]{BiocParallel}} functions that +#' automatically sets the appropriate parameters based on the number of workers +#' specified. +#' +#' @param apply_fun A \code{\link[BiocParallel]{BiocParallel}} function to use +#' for parallel processing. (default = \code{BiocParallel::bplapply}) +#' @inheritParams BiocParallel::bplapply +#' @inheritDotParams BiocParallel::bplapply +#' @inheritDotParams BiocParallel::bpmapply +#' @inheritParams get_bpparam +#' +#' @returns Output relevant to the \code{apply_fun} specified. +#' +#' @examples +#' half_it <- function(arg1) return(arg1 / 2) +#' x <- seq_len(10) +#' +#' res <- bpapply(x, half_it, workers = 2) +#' print(res) +#' +#' @export +bpapply <- function( + X, + FUN, + apply_fun = BiocParallel::bplapply, + workers = 1, + progressbar = workers > 1, + force_snowparam = FALSE, + verbose = FALSE, + ... +) { + stp_msg <- paste("Supplied apply_fun is not a valid BiocParallel function.") + apply_fun_package <- attr(apply_fun, "package") + if (length(apply_fun_package) == 0 || + apply_fun_package != "BiocParallel") stopper(stp_msg) + + BPPARAM <- get_bpparam(workers = workers, + progressbar = progressbar, + force_snowparam = force_snowparam, + verbose = verbose) + + res <- apply_fun(X, FUN = FUN, BPPARAM = BPPARAM, ...) + return(res) +} diff --git a/R/get_bpparam.R b/R/get_bpparam.R new file mode 100644 index 0000000..e957952 --- /dev/null +++ b/R/get_bpparam.R @@ -0,0 +1,45 @@ +#' Get parameters for \link[BiocParallel]{BiocParallel} +#' +#' Get appropriate parameters for \code{BiocParallel} based on the +#' number of workers specified. For less than 4 workers, the function returns a +#' \code{MulticoreParam} object. For 4 or more cores, the function +#' returns a \code{SnowParam} object. Since Windows supports +#' neither, the function returns a \code{SerialParam} object. As a +#' result, Windows users do not benefit from parallel processing. +#' +#' @param workers The number of workers to use for parallel processing. +#' @param force_snowparam A logical indicating whether to force the use of +#' \link[BiocParallel]{SnowParam} object. +#' @param verbose A logical indicating whether to print verbose messages while +#' running the function. (default = FALSE) +#' @inheritParams BiocParallel::SnowParam +#' +#' @returns A \code{BPPARAM} object. +#' +#' @seealso \link[BiocParallel]{BiocParallelParam} +#' +#' @keywords internal +get_bpparam <- function(workers, + progressbar = workers > 1, + force_snowparam = FALSE, + verbose = FALSE) { + if (.Platform$OS.type == "windows") { + custom_bpparam <- BiocParallel::SerialParam() + messager("Windows does not support parallel processing.", + "Returning SerialParam object for BiocParallel.", + v = verbose) + } else if (workers < 4 && !force_snowparam) { + custom_bpparam <- + BiocParallel::MulticoreParam(workers = workers, + progressbar = progressbar) + messager("Using MulticoreParam object for BiocParallel (workers =", + paste0(workers, ")."), v = verbose) + } else { + custom_bpparam <- BiocParallel::SnowParam(workers = workers, + progressbar = progressbar) + messager("Using SnowParam object for BiocParallel (workers =", + paste0(workers, ")."), v = verbose) + } + + return(custom_bpparam) +} diff --git a/R/segregate_seqs.R b/R/segregate_seqs.R new file mode 100644 index 0000000..43f866d --- /dev/null +++ b/R/segregate_seqs.R @@ -0,0 +1,58 @@ +#' Segregate input sequences into common and unique groups +#' +#' This function takes two sets of sequences and segregates them into common and +#' unique sequences. The common sequences are sequences that are present in both +#' sets of sequences. The unique sequences are sequences that are present in +#' only one of the sets of sequences. +#' +#' Sequences are considered common if their base pairs align in any +#' position, even if they vary in length. Consequently, while the number of +#' common sequences remains consistent between both sets, but the length and +#' composition of these sequences may differ. As a result, the function returns +#' distinct sets of common sequences for each input set of sequences. +#' +#' @param seqs1 A set of sequences (\code{GRanges} object) +#' @param seqs2 A set of sequences (\code{GRanges} object) +#' +#' @importFrom GenomicRanges findOverlaps +#' @importFrom S4Vectors queryHits subjectHits +#' +#' @returns A list containing the common sequences and unique sequences for each +#' set of sequences. The list contains the following \code{GRanges} objects: +#' \itemize{ +#' \item \code{common_seqs1}: Common sequences in \code{seqs1} +#' \item \code{common_seqs2}: Common sequences in \code{seqs2} +#' \item \code{unique_seqs1}: Unique sequences in \code{seqs1} +#' \item \code{unique_seqs2}: Unique sequences in \code{seqs2} +#' } +#' +#' @examples +#' data("CTCF_ChIP_peaks", package = "MotifPeeker") +#' data("CTCF_TIP_peaks", package = "MotifPeeker") +#' +#' seqs1 <- CTCF_ChIP_peaks +#' seqs2 <- CTCF_TIP_peaks +#' res <- segregate_seqs(seqs1, seqs2) +#' print(res) +#' +#' @seealso \link[GenomicRanges]{findOverlaps} +#' +#' @export +segregate_seqs <- function(seqs1, seqs2) { + common_seqs_ranges <- GenomicRanges::findOverlaps(seqs1, seqs2, + type = "any") + + common_seqs1 <- seqs1[S4Vectors::queryHits(common_seqs_ranges)] + common_seqs2 <- seqs2[S4Vectors::subjectHits(common_seqs_ranges)] + unique_seqs1 <- seqs1[-S4Vectors::queryHits(common_seqs_ranges)] + unique_seqs2 <- seqs2[-S4Vectors::subjectHits(common_seqs_ranges)] + + return( + list( + common_seqs1 = common_seqs1, + common_seqs2 = common_seqs2, + unique_seqs1 = unique_seqs1, + unique_seqs2 = unique_seqs2 + ) + ) +} diff --git a/inst/markdown/MotifPeeker.Rmd b/inst/markdown/MotifPeeker.Rmd index c70a137..6f26b40 100644 --- a/inst/markdown/MotifPeeker.Rmd +++ b/inst/markdown/MotifPeeker.Rmd @@ -127,7 +127,11 @@ if (alignment_metrics) { ### Known-motif Analysis ### comparison_indices <- setdiff(length(result$peaks), params$reference_index) +segregated_peaks <- lapply(comparison_indices, function(x) { + segregate_seqs(result$peaks[[params$reference_index]], result$peaks[[x]]) +}) +## Calculate enrichment for segregated peaks } diff --git a/man/MotifPeeker.Rd b/man/MotifPeeker.Rd index bcbb39e..6e8b785 100644 --- a/man/MotifPeeker.Rd +++ b/man/MotifPeeker.Rd @@ -23,7 +23,7 @@ MotifPeeker( output_dir = tempdir(), display = NULL, use_cache = TRUE, - ncpus = 1, + ncpus = 2, quiet = TRUE, debug = FALSE, verbose = FALSE diff --git a/man/bpapply.Rd b/man/bpapply.Rd new file mode 100644 index 0000000..8fb6594 --- /dev/null +++ b/man/bpapply.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bpapply.R +\name{bpapply} +\alias{bpapply} +\title{Use BiocParallel functions with appropriate parameters} +\usage{ +bpapply( + X, + FUN, + apply_fun = BiocParallel::bplapply, + workers = 1, + progressbar = workers > 1, + force_snowparam = FALSE, + verbose = FALSE, + ... +) +} +\arguments{ +\item{X}{ + Any object for which methods \code{length}, \code{[}, and + \code{[[} are implemented. + } + +\item{FUN}{ + The \code{function} to be applied to each element of \code{X}. + } + +\item{apply_fun}{A \code{\link[BiocParallel]{BiocParallel}} function to use +for parallel processing. (default = \code{BiocParallel::bplapply})} + +\item{workers}{The number of workers to use for parallel processing.} + +\item{progressbar}{ + \code{logical(1)} Enable progress bar (based on plyr:::progress_text). + } + +\item{force_snowparam}{A logical indicating whether to force the use of +\link[BiocParallel]{SnowParam} object.} + +\item{verbose}{A logical indicating whether to print verbose messages while +running the function. (default = FALSE)} + +\item{...}{ + Arguments passed on to \code{\link[BiocParallel:bplapply]{BiocParallel::bplapply}}, \code{\link[BiocParallel:bpmapply]{BiocParallel::bpmapply}} + \describe{ + \item{\code{BPPARAM}}{ + An optional \code{\link[BiocParallel]{BiocParallelParam}} instance + determining the parallel back-end to be used during evaluation, or a + \code{list} of \code{BiocParallelParam} instances, to be applied in + sequence for nested calls to \pkg{BiocParallel} functions. + } + \item{\code{BPREDO}}{A \code{list} of output from \code{bplapply} with one or + more failed elements. When a list is given in \code{BPREDO}, + \code{bpok} is used to identify errors, tasks are rerun and inserted + into the original results. + } + \item{\code{BPOPTIONS}}{ + Additional options to control the behavior of the parallel evaluation, see \code{\link[BiocParallel]{bpoptions}}. + } + \item{\code{MoreArgs}}{List of additional arguments to \code{FUN}. + } + \item{\code{SIMPLIFY}}{ + If \code{TRUE} the result will be simplified using + \code{\link{simplify2array}}. + } + \item{\code{USE.NAMES}}{If \code{TRUE} the result will be named. + } + }} +} +\value{ +Output relevant to the \code{apply_fun} specified. +} +\description{ +Light wrapper around \code{\link[BiocParallel]{BiocParallel}} functions that +automatically sets the appropriate parameters based on the number of workers +specified. +} +\examples{ +half_it <- function(arg1) return(arg1 / 2) +x <- seq_len(10) + +res <- bpapply(x, half_it, workers = 2) +print(res) + +} diff --git a/man/get_bpparam.Rd b/man/get_bpparam.Rd new file mode 100644 index 0000000..1e48ece --- /dev/null +++ b/man/get_bpparam.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_bpparam.R +\name{get_bpparam} +\alias{get_bpparam} +\title{Get parameters for \link[BiocParallel]{BiocParallel}} +\usage{ +get_bpparam( + workers, + progressbar = workers > 1, + force_snowparam = FALSE, + verbose = FALSE +) +} +\arguments{ +\item{workers}{The number of workers to use for parallel processing.} + +\item{progressbar}{ + \code{logical(1)} Enable progress bar (based on plyr:::progress_text). + } + +\item{force_snowparam}{A logical indicating whether to force the use of +\link[BiocParallel]{SnowParam} object.} + +\item{verbose}{A logical indicating whether to print verbose messages while +running the function. (default = FALSE)} +} +\value{ +A \code{BPPARAM} object. +} +\description{ +Get appropriate parameters for \code{BiocParallel} based on the +number of workers specified. For less than 4 workers, the function returns a +\code{MulticoreParam} object. For 4 or more cores, the function +returns a \code{SnowParam} object. Since Windows supports +neither, the function returns a \code{SerialParam} object. As a +result, Windows users do not benefit from parallel processing. +} +\seealso{ +\link[BiocParallel]{BiocParallelParam} +} +\keyword{internal} diff --git a/man/segregate_seqs.Rd b/man/segregate_seqs.Rd new file mode 100644 index 0000000..656d39c --- /dev/null +++ b/man/segregate_seqs.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/segregate_seqs.R +\name{segregate_seqs} +\alias{segregate_seqs} +\title{Segregate input sequences into common and unique groups} +\usage{ +segregate_seqs(seqs1, seqs2) +} +\arguments{ +\item{seqs1}{A set of sequences (\code{GRanges} object)} + +\item{seqs2}{A set of sequences (\code{GRanges} object)} +} +\value{ +A list containing the common sequences and unique sequences for each +set of sequences. The list contains the following \code{GRanges} objects: +\itemize{ + \item \code{common_seqs1}: Common sequences in \code{seqs1} + \item \code{common_seqs2}: Common sequences in \code{seqs2} + \item \code{unique_seqs1}: Unique sequences in \code{seqs1} + \item \code{unique_seqs2}: Unique sequences in \code{seqs2} +} +} +\description{ +This function takes two sets of sequences and segregates them into common and +unique sequences. The common sequences are sequences that are present in both +sets of sequences. The unique sequences are sequences that are present in +only one of the sets of sequences. +} +\details{ +Sequences are considered common if their base pairs align in any +position, even if they vary in length. Consequently, while the number of +common sequences remains consistent between both sets, but the length and +composition of these sequences may differ. As a result, the function returns +distinct sets of common sequences for each input set of sequences. +} +\examples{ +data("CTCF_ChIP_peaks", package = "MotifPeeker") +data("CTCF_TIP_peaks", package = "MotifPeeker") + +seqs1 <- CTCF_ChIP_peaks +seqs2 <- CTCF_TIP_peaks +res <- segregate_seqs(seqs1, seqs2) +print(res) + +} +\seealso{ +\link[GenomicRanges]{findOverlaps} +} diff --git a/tests/testthat/test-bpapply.R b/tests/testthat/test-bpapply.R new file mode 100644 index 0000000..8c767e1 --- /dev/null +++ b/tests/testthat/test-bpapply.R @@ -0,0 +1,26 @@ +test_that("bpapply works", { + test_func <- function(arg1, arg2 = NULL) { + if (is.null(arg2)) return(arg1) + return(arg1 + arg2) + } + + x <- y <- seq_len(10) + + ### Non-existent apply_fun ### + expect_error(bpapply(x, test_func, apply_fun = "does_not_exist")) + + ### bplapply ### + res <- bpapply(x, test_func, workers = 2) + expect_equal(unlist(res), x) + + ### SnowParam ### + res <- bpapply(x, test_func, workers = 1, force_snowparam = TRUE, + progressbar = FALSE) + expect_equal(unlist(res), x) + + ### bpmapply ### + res <- bpapply(x, test_func, apply_fun = BiocParallel::bpmapply, + workers = 2, MoreArgs = list(arg2 = y), + progressbar = FALSE) + expect_equal(res[1,2], 3) +}) diff --git a/tests/testthat/test-segregate_seqs.R b/tests/testthat/test-segregate_seqs.R new file mode 100644 index 0000000..36b103b --- /dev/null +++ b/tests/testthat/test-segregate_seqs.R @@ -0,0 +1,19 @@ +test_that("seggregate_seqs work", { + data("CTCF_ChIP_peaks", package = "MotifPeeker") + data("CTCF_TIP_peaks", package = "MotifPeeker") + + seqs1 <- CTCF_ChIP_peaks + seqs2 <- CTCF_TIP_peaks + res <- segregate_seqs(seqs1, seqs2) + common_overlaps <- GenomicRanges::findOverlaps(res$common_seqs1, + res$common_seqs2, + type = "any") + unique_overlaps <- GenomicRanges::findOverlaps(res$unique_seqs1, + res$unique_seqs2, + type = "any") + + expect_equal(length(S4Vectors::queryHits(common_overlaps)), + length(S4Vectors::subjectHits(common_overlaps))) + expect_equal(length(S4Vectors::queryHits(unique_overlaps)), 0) + expect_equal(length(S4Vectors::subjectHits(unique_overlaps)), 0) +})