From 2b1f51db646aae41526f8072f185c6634b87bf3b Mon Sep 17 00:00:00 2001 From: HDash <16350928+HDash@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:33:06 +0000 Subject: [PATCH] Replace `workers` with `BPPARAM` argument and remove `get_bpparam()` --- DESCRIPTION | 1 - NAMESPACE | 1 + NEWS.md | 4 ++ R/MotifPeeker.R | 63 ++++++++++++++---------- R/bpapply.R | 15 ++---- R/denovo_motifs.R | 9 ++-- R/find_motifs.R | 5 +- R/get_bpparam.R | 47 ------------------ R/get_df_distances.R | 7 ++- R/get_df_enrichment.R | 8 +-- R/motif_similarity.R | 8 +-- R/plot_enrichment_individual.R | 2 +- R/plot_enrichment_overall.R | 2 +- README.md | 2 +- inst/markdown/MotifPeeker.Rmd | 12 ++--- man/MotifPeeker.Rd | 56 ++++++++++++--------- man/bpapply.Rd | 26 ++-------- man/denovo_motifs.Rd | 9 ++-- man/find_motifs.Rd | 9 ++-- man/get_bpparam.Rd | 41 --------------- man/get_df_distances.Rd | 20 +++++--- man/get_df_enrichment.Rd | 19 +++++-- man/motif_similarity.Rd | 8 +-- man/plot_enrichment_individual.Rd | 2 +- man/plot_enrichment_overall.Rd | 2 +- tests/testthat/test-MotifPeeker.R | 3 -- tests/testthat/test-bpapply.R | 9 +--- tests/testthat/test-denovo_motif_funcs.R | 4 +- tests/testthat/test-enrichment_funcs.R | 3 +- tests/testthat/test-get_df_distances.R | 2 +- vignettes/MotifPeeker.Rmd | 10 ++-- vignettes/troubleshooting.Rmd | 2 +- 32 files changed, 168 insertions(+), 243 deletions(-) delete mode 100644 R/get_bpparam.R delete mode 100644 man/get_bpparam.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 359a3e8..46513fa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,7 +64,6 @@ Imports: stats, utils Suggests: - BiocStyle, BSgenome.Hsapiens.UCSC.hg19, BSgenome.Hsapiens.UCSC.hg38, downloadthis, diff --git a/NAMESPACE b/NAMESPACE index aa91489..cf3dd3a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,7 @@ importFrom(BSgenome,getSeq) importFrom(BiocFileCache,BiocFileCache) importFrom(BiocFileCache,bfcinfo) importFrom(BiocFileCache,bfcrpath) +importFrom(BiocParallel,bpnworkers) 
importFrom(Biostrings,DNAString) importFrom(Biostrings,letterFrequency) importFrom(DT,datatable) diff --git a/NEWS.md b/NEWS.md index 3f129d3..e614659 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # MotifPeeker 0.99.7 +## New Features +* Replace `workers` argument with `BPPARAM`. Give users more control over the +BiocParallel implementation. + ## Miscellaneous * Remove `cat()` calls in functions. diff --git a/R/MotifPeeker.R b/R/MotifPeeker.R index a41b395..e934542 100644 --- a/R/MotifPeeker.R +++ b/R/MotifPeeker.R @@ -11,28 +11,28 @@ #' hours to complete. To make computation faster, we highly recommend tuning the #' following arguments: #' \describe{ -#' \item{\code{workers}}{Running motif discovery in parallel can -#' significantly reduce runtime, but it is very memory-intensive, consuming -#' upwards of 10GB of RAM per thread. Memory starvation can greatly slow the -#' process, so set \code{workers} with caution.} -#' \item{\code{denovo_motifs}}{The number of motifs to discover per sequence -#' group exponentially increases runtime. We recommend no more than 5 -#' motifs to make a meaningful inference.} -#' \item{\code{trim_seq_width}}{Trimming sequences before running de-novo -#' motif discovery can significantly reduce the search space. Sequence -#' length can exponentially increase runtime. We recommend running the -#' script with \code{denovo_motif_discovery = FALSE} and studying the -#' motif-summit distance distribution under general metrics to find the -#' sequence length that captures most motifs. A good starting point is 150 -#' but it can be reduced further if appropriate.} +#' \item{\code{BPPARAM=MulticoreParam(x)}}{Running motif discovery in +#' parallel can significantly reduce runtime, but it is very +#' memory-intensive, consuming 10+GB of RAM per thread. 
Memory starvation can +#' greatly slow the process, so set the number of cores with caution.} +#' \item{\code{denovo_motifs}}{The number of motifs to discover per sequence +#' group exponentially increases runtime. We recommend no more than 5 +#' motifs to make a meaningful inference.} +#' \item{\code{trim_seq_width}}{Trimming sequences before running de-novo +#' motif discovery can significantly reduce the search space. Sequence +#' length can exponentially increase runtime. We recommend running the +#' script with \code{denovo_motif_discovery = FALSE} and studying the +#' motif-summit distance distribution under general metrics to find the +#' sequence length that captures most motifs. A good starting point is 150 +#' but it can be reduced further if appropriate.} #' } #' #' @param peak_files A character vector of path to peak files, or a vector of #' GRanges objects generated using \code{\link{read_peak_file}}. Currently, #' peak files from the following peak-calling tools are supported: #' \itemize{ -#' \item MACS2: \code{.narrowPeak} files -#' \item SEACR: \code{.bed} files +#' \item MACS2: \code{.narrowPeak} files +#' \item SEACR: \code{.bed} files #' } #' ENCODE file IDs can also be provided to automatically fetch peak file(s) from #' the ENCODE database. @@ -81,13 +81,22 @@ #' @param display A character vector specifying the display mode for the HTML #' report once it is generated. (default = NULL) Options are: #' \itemize{ -#' \item \code{"browser"}: Open the report in the default web browser. -#' \item \code{"rstudio"}: Open the report in the RStudio Viewer. -#' \item \code{NULL}: Do not open the report. +#' \item \code{"browser"}: Open the report in the default web browser. +#' \item \code{"rstudio"}: Open the report in the RStudio Viewer. +#' \item \code{NULL}: Do not open the report. #' } -#' @param workers An integer specifying the number of threads to use for -#' parallel processing. 
(default = 1)\cr -#' \strong{IMPORTANT:} For each worker, please ensure a minimum of 6GB of +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam-class}} object +#' enabling parallel execution. (default = SerialParam(), single-CPU run)\cr\cr +#' Following are two examples of how to set up parallel processing: +#' \itemize{ +#' \item \code{BPPARAM = BiocParallel::MulticoreParam(4)}: Uses 4 +#' CPU cores for parallel processing. +#' \item \code{library("BiocParallel")} followed by +#' \code{register(MulticoreParam(4))} sets all subsequent BiocParallel +#' functions to use 4 CPU cores. \code{MotifPeeker()} must then be run +#' with \code{BPPARAM = BiocParallel::bpparam()}. +#' } +#' \strong{IMPORTANT:} For each worker, please ensure a minimum of 8GB of #' memory (RAM) is available as \code{denovo_motif_discovery} is #' memory-intensive. #' @param quiet A logical indicating whether to print markdown knit messages. @@ -99,7 +108,7 @@ #' @inheritParams check_genome_build #' @inheritParams read_motif_file #' @inheritParams check_genome_build -#' @inheritParams get_bpparam +#' @inheritParams bpapply #' @inheritParams memes::runFimo #' @inheritParams denovo_motifs #' @inheritParams find_motifs @@ -111,6 +120,7 @@ #' @importFrom viridis scale_fill_viridis scale_color_viridis #' @importFrom tools file_path_sans_ext #' @importFrom rmarkdown render +#' @importFrom BiocParallel bpnworkers #' #' @return Path to the output directory.
#' @@ -159,7 +169,6 @@ #' motif_db = NULL, #' download_buttons = TRUE, #' out_dir = tempdir(), -#' workers = 1, #' debug = FALSE, #' quiet = TRUE, #' verbose = FALSE @@ -188,7 +197,7 @@ MotifPeeker <- function( out_dir = tempdir(), save_runfiles = FALSE, display = if (interactive()) "browser", - workers = 2, + BPPARAM = BiocParallel::SerialParam(), # Default to single-core quiet = TRUE, debug = FALSE, verbose = FALSE @@ -269,7 +278,7 @@ MotifPeeker <- function( meme_path = meme_path, out_dir = out_dir, save_runfiles = save_runfiles, - workers = workers, + BPPARAM = BPPARAM, debug = debug, verbose = verbose ) @@ -277,6 +286,8 @@ MotifPeeker <- function( ### Knit Rmd ### rmd_file <- system.file("markdown", "MotifPeeker.Rmd", package = "MotifPeeker") + messager("Starting run with", BiocParallel::bpnworkers(BPPARAM), "cores.", + v = verbose) rmarkdown::render( input = rmd_file, output_dir = out_dir, diff --git a/R/bpapply.R b/R/bpapply.R index 12bfe05..69a1047 100644 --- a/R/bpapply.R +++ b/R/bpapply.R @@ -1,15 +1,15 @@ #' Use BiocParallel functions with appropriate parameters #' #' Light wrapper around \code{\link[BiocParallel]{BiocParallel}} functions that -#' automatically sets the appropriate parameters based on the number of workers -#' specified. +#' automatically applies appropriate parallel function. #' #' @param apply_fun A \code{\link[BiocParallel]{BiocParallel}} function to use #' for parallel processing. (default = \code{BiocParallel::bplapply}) +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam-class}} object +#' specifying run parameters. 
(default = bpparam()) #' @inheritParams BiocParallel::bplapply #' @inheritDotParams BiocParallel::bplapply #' @inheritDotParams BiocParallel::bpmapply -#' @inheritParams get_bpparam #' #' @import BiocParallel #' @@ -19,7 +19,7 @@ #' half_it <- function(arg1) return(arg1 / 2) #' x <- seq_len(10) #' -#' res <- MotifPeeker:::bpapply(x, half_it, workers = 2) +#' res <- MotifPeeker:::bpapply(x, half_it) #' print(res) #' #' @keywords internal @@ -27,7 +27,7 @@ bpapply <- function( X, FUN, apply_fun = BiocParallel::bplapply, - workers = 1, + BPPARAM = BiocParallel::bpparam(), progressbar = FALSE, force_snowparam = FALSE, verbose = FALSE, @@ -38,11 +38,6 @@ bpapply <- function( if (length(apply_fun_package) == 0 || apply_fun_package != "BiocParallel") stop(stp_msg) - BPPARAM <- get_bpparam(workers = workers, - progressbar = progressbar, - force_snowparam = force_snowparam, - verbose = verbose) - res <- apply_fun(X, FUN = FUN, BPPARAM = BPPARAM, ...) return(res) } diff --git a/R/denovo_motifs.R b/R/denovo_motifs.R index 8212395..87c2e1e 100644 --- a/R/denovo_motifs.R +++ b/R/denovo_motifs.R @@ -25,12 +25,15 @@ #' (default = 6) #' @param out_dir A \code{character} vector of output directory to save STREME #' results to. (default = \code{tempdir()}) +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam-class}} object +#' specifying run parameters. (default = SerialParam(), single core run) +#' @param debug A logical indicating whether to print debug messages while +#' running the function. (default = FALSE) #' @param ... Additional arguments to pass to \code{STREME}. For more #' information, refer to the official MEME Suite documentation on #' \href{https://meme-suite.org/meme/doc/streme.html}{STREME}. #' @inheritParams bpapply #' @inheritParams motif_enrichment -#' @inheritParams MotifPeeker #' #' @returns A list of \code{\link[universalmotif]{universalmotif}} objects and #' associated metadata. 
@@ -60,7 +63,7 @@ denovo_motifs <- function(seqs, filter_n = 6, out_dir = tempdir(), meme_path = NULL, - workers = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, debug = FALSE, ...) { @@ -96,7 +99,7 @@ denovo_motifs <- function(seqs, ### Filter motifs ### out <- filter_repeats(streme_out, filter_n) return(out) - }, workers = workers, verbose = verbose + }, BPPARAM = BPPARAM, verbose = verbose ) messager("STREME run complete.", v = verbose) return(res) diff --git a/R/find_motifs.R b/R/find_motifs.R index 50b7ccc..dcd8dc7 100644 --- a/R/find_motifs.R +++ b/R/find_motifs.R @@ -13,6 +13,7 @@ #' @param ... Additional arguments to pass to \code{TOMTOM}. For more #' information, refer to the official MEME Suite documentation on #' \href{https://meme-suite.org/meme/doc/tomtom.html}{TOMTOM}. +#' @inheritParams bpapply #' @inheritParams denovo_motifs #' #' @importFrom memes runTomTom @@ -46,7 +47,7 @@ find_motifs <- function(streme_out, motif_db, out_dir = tempdir(), meme_path = NULL, - workers = 1, + BPPARAM = BiocParallel::bpparam(), verbose = FALSE, debug = FALSE, ...) { @@ -66,7 +67,7 @@ find_motifs <- function(streme_out, ) return(res_x) }) - }, workers = workers, verbose = verbose + }, BPPARAM = BPPARAM, verbose = verbose ) return(res) } diff --git a/R/get_bpparam.R b/R/get_bpparam.R deleted file mode 100644 index 097dbc6..0000000 --- a/R/get_bpparam.R +++ /dev/null @@ -1,47 +0,0 @@ -#' Get parameters for \link[BiocParallel]{BiocParallel} -#' -#' Get appropriate parameters for \code{BiocParallel} based on the -#' number of workers specified. For less than 10 workers, the function returns a -#' \code{MulticoreParam} object. For 10 or more cores, the function -#' returns a \code{SnowParam} object. Since Windows supports -#' neither, the function returns a \code{SerialParam} object. As a -#' result, Windows users do not benefit from parallel processing. -#' -#' @param workers The number of workers to use for parallel processing. 
-#' @param force_snowparam A logical indicating whether to force the use of -#' \link[BiocParallel]{SnowParam} object. -#' @param verbose A logical indicating whether to print verbose messages while -#' running the function. (default = FALSE) -#' @inheritParams BiocParallel::SnowParam -#' -#' @import BiocParallel -#' -#' @returns A \code{BPPARAM} object. -#' -#' @seealso \link[BiocParallel]{BiocParallelParam} -#' -#' @keywords internal -get_bpparam <- function(workers, - progressbar = workers > 1, - force_snowparam = FALSE, - verbose = FALSE) { - if (.Platform$OS.type == "windows") { - custom_bpparam <- BiocParallel::SerialParam() - messager("Windows does not support parallel processing.", - "Returning SerialParam object for BiocParallel.", - v = verbose) - } else if (workers < 10 && !force_snowparam) { - custom_bpparam <- - BiocParallel::MulticoreParam(workers = workers, - progressbar = progressbar) - messager("Using MulticoreParam object for BiocParallel (workers =", - paste0(workers, ")."), v = verbose) - } else { - custom_bpparam <- BiocParallel::SnowParam(workers = workers, - progressbar = progressbar) - messager("Using SnowParam object for BiocParallel (workers =", - paste0(workers, ")."), v = verbose) - } - - return(custom_bpparam) -} diff --git a/R/get_df_distances.R b/R/get_df_distances.R index e399b31..459f437 100644 --- a/R/get_df_distances.R +++ b/R/get_df_distances.R @@ -53,8 +53,7 @@ #' #' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg38")) { #' genome_build <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 -#' distances_df <- get_df_distances(input, motifs, genome_build, -#' workers = 1) +#' distances_df <- get_df_distances(input, motifs, genome_build) #' print(distances_df) #' } #' } @@ -66,7 +65,7 @@ get_df_distances <- function(result, user_motifs, genome_build, out_dir = tempdir(), - workers = 1, + BPPARAM = BiocParallel::bpparam(), meme_path = NULL, verbose = FALSE) { if (!is.list(result$peaks)) result$peaks <- list(result$peaks) @@ 
-95,7 +94,7 @@ get_df_distances <- function(result, )$distance_to_summit ) }, - workers = workers, verbose = verbose) %>% + BPPARAM = BPPARAM, verbose = verbose) %>% purrr::map_df(as.data.frame) ## Output: Peak 1 - Motif 1, 2... diff --git a/R/get_df_enrichment.R b/R/get_df_enrichment.R index d753e0c..14cd77b 100644 --- a/R/get_df_enrichment.R +++ b/R/get_df_enrichment.R @@ -52,7 +52,7 @@ #' #' enrichment_df <- get_df_enrichment( #' input, segregated_input, motifs, genome_build, -#' reference_index = 1, workers = 1 +#' reference_index = 1 #' ) #' } #' } @@ -67,7 +67,7 @@ get_df_enrichment <- function(result, genome_build, reference_index = 1, out_dir = tempdir(), - workers = 1, + BPPARAM = BiocParallel::bpparam(), meme_path = NULL, verbose = FALSE) { if (!is.list(result$peaks)) result$peaks <- list(result$peaks) @@ -113,7 +113,7 @@ get_df_enrichment <- function(result, run_index = i ) }, - workers = workers, verbose = verbose) %>% + BPPARAM = BPPARAM, verbose = verbose) %>% purrr::map_df(as.data.frame) ## 2. 
Segregated peaks @@ -165,7 +165,7 @@ get_df_enrichment <- function(result, run_index = i ) }, - workers = workers, verbose = verbose) %>% + BPPARAM = BPPARAM, verbose = verbose) %>% purrr::map_df(as.data.frame) enrichment_df <- rbind(enrichment_df_all, enrichment_df_seg) diff --git a/R/motif_similarity.R b/R/motif_similarity.R index 1b5ee40..5516d04 100644 --- a/R/motif_similarity.R +++ b/R/motif_similarity.R @@ -10,6 +10,7 @@ #' @inheritDotParams universalmotif::compare_motifs #' #' @importFrom universalmotif compare_motifs +#' @importFrom BiocParallel bpnworkers #' #' @inherit universalmotif::compare_motifs details #' @@ -44,8 +45,7 @@ #' genome_build = genome_build, #' denovo_motifs = 2, #' filter_n = 6, -#' out_dir = tempdir(), -#' workers = 1) +#' out_dir = tempdir()) #' similarity_matrices <- motif_similarity(denovo_motifs) #' print(similarity_matrices) #' } @@ -56,7 +56,7 @@ motif_similarity <- function(streme_out, method = "PCC", normalise.scores = TRUE, - workers = 1, + BPPARAM = BiocParallel::bpparam(), ...) { ## Motif group sequence - #1 Common seqs - Reference (1) ## (4 Groups per #2 Common seqs - Comparison (2) @@ -81,7 +81,7 @@ motif_similarity <- function(streme_out, list(m1, m2), method = method, normalise.scores = normalise.scores, - nthreads = workers, + nthreads = BiocParallel::bpnworkers(BPPARAM), ... 
) row_indices <- seq(1, length(m1)) diff --git a/R/plot_enrichment_individual.R b/R/plot_enrichment_individual.R index 7445fc1..cdf555f 100644 --- a/R/plot_enrichment_individual.R +++ b/R/plot_enrichment_individual.R @@ -48,7 +48,7 @@ #' BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 #' enrichment_df <- get_df_enrichment( #' input, segregated_input, motifs, genome_build, -#' reference_index = 1, workers = 1 +#' reference_index = 1 #' ) #' label_colours <- c("red", "cyan") #' diff --git a/R/plot_enrichment_overall.R b/R/plot_enrichment_overall.R index c12e11a..f2e832d 100644 --- a/R/plot_enrichment_overall.R +++ b/R/plot_enrichment_overall.R @@ -46,7 +46,7 @@ #' BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 #' enrichment_df <- get_df_enrichment( #' input, segregated_input, motifs, genome_build, -#' reference_index = 1, workers = 1 +#' reference_index = 1 #' ) #' label_colours <- c("red", "cyan") #' diff --git a/README.md b/README.md index 3e8db7b..e6c7370 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ status](https://github.com/neurogenomics/MotifPeeker/workflows/rworkflows/badge. 
**Authors:** ***Hiranyamaya (Hiru) Dash, Thomas Roberts, Nathan Skene*** -**Updated:** ***Nov-04-2024*** +**Updated:** ***Nov-05-2024*** ## Introduction diff --git a/inst/markdown/MotifPeeker.Rmd b/inst/markdown/MotifPeeker.Rmd index 276097b..70e4a18 100644 --- a/inst/markdown/MotifPeeker.Rmd +++ b/inst/markdown/MotifPeeker.Rmd @@ -45,7 +45,7 @@ params: value: NULL save_runfiles: value: NULL - workers: + BPPARAM: value: NULL debug: value: FALSE @@ -153,7 +153,7 @@ peak_width_df <- ## Motif-Summit Distances motif_summit_dist_df <- get_df_distances( - result, user_motifs, genome_build, out_dir_extra, params$workers, + result, user_motifs, genome_build, out_dir_extra, params$BPPARAM, params$meme_path, params$debug ) @@ -170,7 +170,7 @@ if (comparison_metrics) { if (user_motif_metrics) { enrichment_df <- get_df_enrichment( result, segregated_peaks, user_motifs, genome_build, - params$reference_index, out_dir_extra, params$workers, + params$reference_index, out_dir_extra, params$BPPARAM, params$meme_path, params$verbose ) } @@ -184,17 +184,17 @@ if (denovo_metrics) { unlist(segregated_peaks), params$trim_seq_width, genome_build, params$denovo_motifs, filter_n = params$filter_n, out_dir = out_dir_extra, meme_path = params$meme_path, - workers = params$workers, verbose = params$verbose, debug = params$debug + BPPARAM = params$BPPARAM, verbose = params$verbose, debug = params$debug ) ## Run TOMTOM denovo_res$similar_motifs <- find_motifs( denovo_res$streme, motif_db, out_dir = out_dir_extra, - meme_path = params$meme_path, workers = params$workers, + meme_path = params$meme_path, BPPARAM = params$BPPARAM, verbose = params$verbose, debug = params$debug ) ## Compare motifs denovo_res$comparisons <- motif_similarity( - denovo_res$streme, workers = params$workers + denovo_res$streme, BPPARAM = params$BPPARAM ) } ``` diff --git a/man/MotifPeeker.Rd b/man/MotifPeeker.Rd index f88967e..4896bb6 100644 --- a/man/MotifPeeker.Rd +++ b/man/MotifPeeker.Rd @@ -24,7 +24,7 @@ MotifPeeker( 
out_dir = tempdir(), save_runfiles = FALSE, display = if (interactive()) "browser", - workers = 2, + BPPARAM = BiocParallel::SerialParam(), quiet = TRUE, debug = FALSE, verbose = FALSE @@ -35,8 +35,8 @@ MotifPeeker( GRanges objects generated using \code{\link{read_peak_file}}. Currently, peak files from the following peak-calling tools are supported: \itemize{ - \item MACS2: \code{.narrowPeak} files - \item SEACR: \code{.bed} files + \item MACS2: \code{.narrowPeak} files + \item SEACR: \code{.bed} files } ENCODE file IDs can also be provided to automatically fetch peak file(s) from the ENCODE database.} @@ -126,14 +126,23 @@ generated during the run, such as those from FIMO and AME. (default = FALSE)} \item{display}{A character vector specifying the display mode for the HTML report once it is generated. (default = NULL) Options are: \itemize{ - \item \code{"browser"}: Open the report in the default web browser. - \item \code{"rstudio"}: Open the report in the RStudio Viewer. - \item \code{NULL}: Do not open the report. + \item \code{"browser"}: Open the report in the default web browser. + \item \code{"rstudio"}: Open the report in the RStudio Viewer. + \item \code{NULL}: Do not open the report. }} -\item{workers}{An integer specifying the number of threads to use for -parallel processing. (default = 1)\cr -\strong{IMPORTANT:} For each worker, please ensure a minimum of 6GB of +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +enabling parallel execution. (default = SerialParam(), single-CPU run)\cr\cr +Following are two examples of how to set up parallel processing: +\itemize{ + \item \code{BPPARAM = BiocParallel::MulticoreParam(4)}: Uses 4 + CPU cores for parallel processing. + \item \code{library("BiocParallel")} followed by + \code{register(MulticoreParam(4))} sets all subsequent BiocParallel + functions to use 4 CPU cores. \code{MotifPeeker()} must then be run + with \code{BPPARAM = BiocParallel::bpparam()}.
+} +\strong{IMPORTANT:} For each worker, please ensure a minimum of 8GB of memory (RAM) is available as \code{denovo_motif_discovery} is memory-intensive.} @@ -162,20 +171,20 @@ denovo_motif_discovery disabled. However, de-novo motif discovery can take hours to complete. To make computation faster, we highly recommend tuning the following arguments: \describe{ - \item{\code{workers}}{Running motif discovery in parallel can - significantly reduce runtime, but it is very memory-intensive, consuming - upwards of 10GB of RAM per thread. Memory starvation can greatly slow the - process, so set \code{workers} with caution.} - \item{\code{denovo_motifs}}{The number of motifs to discover per sequence - group exponentially increases runtime. We recommend no more than 5 - motifs to make a meaningful inference.} - \item{\code{trim_seq_width}}{Trimming sequences before running de-novo - motif discovery can significantly reduce the search space. Sequence - length can exponentially increase runtime. We recommend running the - script with \code{denovo_motif_discovery = FALSE} and studying the - motif-summit distance distribution under general metrics to find the - sequence length that captures most motifs. A good starting point is 150 - but it can be reduced further if appropriate.} + \item{\code{BPPARAM=MulticoreParam(x)}}{Running motif discovery in + parallel can significantly reduce runtime, but it is very + memory-intensive, consuming 10+GB of RAM per thread. Memory starvation can + greatly slow the process, so set the number of cores with caution.} + \item{\code{denovo_motifs}}{The number of motifs to discover per sequence + group exponentially increases runtime. We recommend no more than 5 + motifs to make a meaningful inference.} + \item{\code{trim_seq_width}}{Trimming sequences before running de-novo + motif discovery can significantly reduce the search space. Sequence + length can exponentially increase runtime. 
We recommend running the + script with \code{denovo_motif_discovery = FALSE} and studying the + motif-summit distance distribution under general metrics to find the + sequence length that captures most motifs. A good starting point is 150 + but it can be reduced further if appropriate.} } } \note{ @@ -224,7 +233,6 @@ motifs <- list( motif_db = NULL, download_buttons = TRUE, out_dir = tempdir(), - workers = 1, debug = FALSE, quiet = TRUE, verbose = FALSE diff --git a/man/bpapply.Rd b/man/bpapply.Rd index 5b2301e..5012377 100644 --- a/man/bpapply.Rd +++ b/man/bpapply.Rd @@ -8,7 +8,7 @@ bpapply( X, FUN, apply_fun = BiocParallel::bplapply, - workers = 1, + BPPARAM = BiocParallel::bpparam(), progressbar = FALSE, force_snowparam = FALSE, verbose = FALSE, @@ -28,27 +28,12 @@ bpapply( \item{apply_fun}{A \code{\link[BiocParallel]{BiocParallel}} function to use for parallel processing. (default = \code{BiocParallel::bplapply})} -\item{workers}{The number of workers to use for parallel processing.} - -\item{progressbar}{ - \code{logical(1)} Enable progress bar (based on plyr:::progress_text). - } - -\item{force_snowparam}{A logical indicating whether to force the use of -\link[BiocParallel]{SnowParam} object.} - -\item{verbose}{A logical indicating whether to print verbose messages while -running the function. (default = FALSE)} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +specifying run parameters. (default = bpparam())} \item{...}{ Arguments passed on to \code{\link[BiocParallel:bplapply]{BiocParallel::bplapply}}, \code{\link[BiocParallel:bpmapply]{BiocParallel::bpmapply}} \describe{ - \item{\code{BPPARAM}}{ - An optional \code{\link[BiocParallel]{BiocParallelParam}} instance - determining the parallel back-end to be used during evaluation, or a - \code{list} of \code{BiocParallelParam} instances, to be applied in - sequence for nested calls to \pkg{BiocParallel} functions. 
- } \item{\code{BPREDO}}{A \code{list} of output from \code{bplapply} with one or more failed elements. When a list is given in \code{BPREDO}, \code{bpok} is used to identify errors, tasks are rerun and inserted @@ -72,14 +57,13 @@ Output relevant to the \code{apply_fun} specified. } \description{ Light wrapper around \code{\link[BiocParallel]{BiocParallel}} functions that -automatically sets the appropriate parameters based on the number of workers -specified. +automatically applies appropriate parallel function. } \examples{ half_it <- function(arg1) return(arg1 / 2) x <- seq_len(10) -res <- MotifPeeker:::bpapply(x, half_it, workers = 2) +res <- MotifPeeker:::bpapply(x, half_it) print(res) } diff --git a/man/denovo_motifs.Rd b/man/denovo_motifs.Rd index 1690cb2..3c448b2 100644 --- a/man/denovo_motifs.Rd +++ b/man/denovo_motifs.Rd @@ -14,7 +14,7 @@ denovo_motifs( filter_n = 6, out_dir = tempdir(), meme_path = NULL, - workers = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, debug = FALSE, ... @@ -54,13 +54,14 @@ results to. (default = \code{tempdir()})} \item{meme_path}{path to "meme/bin/" (default: \code{NULL}). Will use default search behavior as described in \code{check_meme_install()} if unset.} -\item{workers}{The number of workers to use for parallel processing.} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +specifying run parameters. (default = SerialParam(), single core run)} \item{verbose}{A logical indicating whether to print verbose messages while running the function. (default = FALSE)} -\item{debug}{A logical indicating whether to print debug/error messages in -the HTML report. (default = FALSE)} +\item{debug}{A logical indicating whether to print debug messages while +running the function. (default = FALSE)} \item{...}{Additional arguments to pass to \code{STREME}. 
For more information, refer to the official MEME Suite documentation on diff --git a/man/find_motifs.Rd b/man/find_motifs.Rd index 434bf98..ed43901 100644 --- a/man/find_motifs.Rd +++ b/man/find_motifs.Rd @@ -9,7 +9,7 @@ find_motifs( motif_db, out_dir = tempdir(), meme_path = NULL, - workers = 1, + BPPARAM = BiocParallel::bpparam(), verbose = FALSE, debug = FALSE, ... @@ -31,13 +31,14 @@ results to. (default = \code{tempdir()})} \item{meme_path}{path to "meme/bin/" (default: \code{NULL}). Will use default search behavior as described in \code{check_meme_install()} if unset.} -\item{workers}{The number of workers to use for parallel processing.} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +specifying run parameters. (default = bpparam())} \item{verbose}{A logical indicating whether to print verbose messages while running the function. (default = FALSE)} -\item{debug}{A logical indicating whether to print debug/error messages in -the HTML report. (default = FALSE)} +\item{debug}{A logical indicating whether to print debug messages while +running the function. (default = FALSE)} \item{...}{Additional arguments to pass to \code{TOMTOM}. For more information, refer to the official MEME Suite documentation on diff --git a/man/get_bpparam.Rd b/man/get_bpparam.Rd deleted file mode 100644 index 2a57b59..0000000 --- a/man/get_bpparam.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_bpparam.R -\name{get_bpparam} -\alias{get_bpparam} -\title{Get parameters for \link[BiocParallel]{BiocParallel}} -\usage{ -get_bpparam( - workers, - progressbar = workers > 1, - force_snowparam = FALSE, - verbose = FALSE -) -} -\arguments{ -\item{workers}{The number of workers to use for parallel processing.} - -\item{progressbar}{ - \code{logical(1)} Enable progress bar (based on plyr:::progress_text). 
- } - -\item{force_snowparam}{A logical indicating whether to force the use of -\link[BiocParallel]{SnowParam} object.} - -\item{verbose}{A logical indicating whether to print verbose messages while -running the function. (default = FALSE)} -} -\value{ -A \code{BPPARAM} object. -} -\description{ -Get appropriate parameters for \code{BiocParallel} based on the -number of workers specified. For less than 10 workers, the function returns a -\code{MulticoreParam} object. For 10 or more cores, the function -returns a \code{SnowParam} object. Since Windows supports -neither, the function returns a \code{SerialParam} object. As a -result, Windows users do not benefit from parallel processing. -} -\seealso{ -\link[BiocParallel]{BiocParallelParam} -} -\keyword{internal} diff --git a/man/get_df_distances.Rd b/man/get_df_distances.Rd index f73af2c..b2e1afb 100644 --- a/man/get_df_distances.Rd +++ b/man/get_df_distances.Rd @@ -9,7 +9,7 @@ get_df_distances( user_motifs, genome_build, out_dir = tempdir(), - workers = 1, + BPPARAM = BiocParallel::bpparam(), meme_path = NULL, verbose = FALSE ) @@ -38,9 +38,18 @@ are supported as abbreviated input.} \item{out_dir}{A \code{character} vector of output directory.} -\item{workers}{An integer specifying the number of threads to use for -parallel processing. (default = 1)\cr -\strong{IMPORTANT:} For each worker, please ensure a minimum of 6GB of +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +enabling parallel execution. (default = SerialParam(), single-CPU run)\cr\cr +Following are two examples of how to set up parallel processing: +\itemize{ + \item \code{BPPARAM = BiocParallel::MulticoreParam(4)}: Uses 4 + CPU cores for parallel processing. + \item \code{library("BiocParallel")} followed by + \code{register(MulticoreParam(4))} sets all subsequent BiocParallel + functions to use 4 CPU cores. \code{MotifPeeker()} must then be run + with \code{BPPARAM = BiocParallel::bpparam()}.
+} +\strong{IMPORTANT:} For each worker, please ensure a minimum of 8GB of memory (RAM) is available as \code{denovo_motif_discovery} is memory-intensive.} @@ -83,8 +92,7 @@ motifs <- list( if (requireNamespace("BSgenome.Hsapiens.UCSC.hg38")) { genome_build <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 - distances_df <- get_df_distances(input, motifs, genome_build, - workers = 1) + distances_df <- get_df_distances(input, motifs, genome_build) print(distances_df) } } diff --git a/man/get_df_enrichment.Rd b/man/get_df_enrichment.Rd index e9d67be..10704da 100644 --- a/man/get_df_enrichment.Rd +++ b/man/get_df_enrichment.Rd @@ -11,7 +11,7 @@ get_df_enrichment( genome_build, reference_index = 1, out_dir = tempdir(), - workers = 1, + BPPARAM = BiocParallel::bpparam(), meme_path = NULL, verbose = FALSE ) @@ -47,9 +47,18 @@ use as the reference dataset for comparison. Indexing starts from 1. \item{out_dir}{A \code{character} vector of output directory.} -\item{workers}{An integer specifying the number of threads to use for -parallel processing. (default = 1)\cr -\strong{IMPORTANT:} For each worker, please ensure a minimum of 6GB of +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +enabling parallel execution. (default = SerialParam(), single-CPU run)\cr\cr +Following are two examples of how to set up parallel processing: +\itemize{ + \item \code{BPPARAM = BiocParallel::MulticoreParam(4)}: Uses 4 + CPU cores for parallel processing. + \item \code{library("BiocParallel")} followed by + \code{register(MulticoreParam(4))} sets all subsequent BiocParallel + functions to use 4 CPU cores. \code{MotifPeeker()} must then be run + with \code{BPPARAM = BiocParallel::bpparam()}.
+} +\strong{IMPORTANT:} For each worker, please ensure a minimum of 8GB of memory (RAM) is available as \code{denovo_motif_discovery} is memory-intensive.} @@ -106,7 +115,7 @@ reference_index <- 1 enrichment_df <- get_df_enrichment( input, segregated_input, motifs, genome_build, - reference_index = 1, workers = 1 + reference_index = 1 ) } } diff --git a/man/motif_similarity.Rd b/man/motif_similarity.Rd index a928ee1..929929c 100644 --- a/man/motif_similarity.Rd +++ b/man/motif_similarity.Rd @@ -8,7 +8,7 @@ motif_similarity( streme_out, method = "PCC", normalise.scores = TRUE, - workers = 1, + BPPARAM = BiocParallel::bpparam(), ... ) } @@ -24,7 +24,8 @@ Similarity scores are multiplied by the ratio of aligned positions to the total number of positions in the larger motif, and the inverse for distance scores.} -\item{workers}{The number of workers to use for parallel processing.} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam-class}} object +specifying run parameters. (default = bpparam())} \item{...}{ Arguments passed on to \code{\link[universalmotif:compare_motifs]{universalmotif::compare_motifs}} @@ -166,8 +167,7 @@ data("CTCF_ChIP_peaks", package = "MotifPeeker") genome_build = genome_build, denovo_motifs = 2, filter_n = 6, - out_dir = tempdir(), - workers = 1) + out_dir = tempdir()) similarity_matrices <- motif_similarity(denovo_motifs) print(similarity_matrices) } diff --git a/man/plot_enrichment_individual.Rd b/man/plot_enrichment_individual.Rd index 5c7e09a..d189a4c 100644 --- a/man/plot_enrichment_individual.Rd +++ b/man/plot_enrichment_individual.Rd @@ -78,7 +78,7 @@ motifs <- list( BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 enrichment_df <- get_df_enrichment( input, segregated_input, motifs, genome_build, - reference_index = 1, workers = 1 + reference_index = 1 ) label_colours <- c("red", "cyan") diff --git a/man/plot_enrichment_overall.Rd b/man/plot_enrichment_overall.Rd index d961211..3acc797 100644 --- 
a/man/plot_enrichment_overall.Rd +++ b/man/plot_enrichment_overall.Rd @@ -66,7 +66,7 @@ motifs <- list( BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 enrichment_df <- get_df_enrichment( input, segregated_input, motifs, genome_build, - reference_index = 1, workers = 1 + reference_index = 1 ) label_colours <- c("red", "cyan") diff --git a/tests/testthat/test-MotifPeeker.R b/tests/testthat/test-MotifPeeker.R index a4e9a4e..4645235 100644 --- a/tests/testthat/test-MotifPeeker.R +++ b/tests/testthat/test-MotifPeeker.R @@ -47,7 +47,6 @@ test_that("MotifPeeker produces output files", { denovo_motifs = 2, motif_db = NULL, download_buttons = TRUE, - workers = 1, out_dir = tempdir(), display = NULL, debug = FALSE, @@ -72,7 +71,6 @@ test_that("MotifPeeker produces output files", { motif_db = NULL, download_buttons = TRUE, out_dir = tempdir(), - workers = 1, display = NULL, debug = FALSE, verbose = FALSE @@ -99,7 +97,6 @@ test_that("MotifPeeker produces output files", { motif_db = NULL, download_buttons = TRUE, out_dir = tempdir(), - workers = 1, display = NULL, debug = FALSE, verbose = FALSE diff --git a/tests/testthat/test-bpapply.R b/tests/testthat/test-bpapply.R index 850dc2f..ddc4310 100644 --- a/tests/testthat/test-bpapply.R +++ b/tests/testthat/test-bpapply.R @@ -11,17 +11,12 @@ test_that("bpapply works", { apply_fun = "does_not_exist")) ### bplapply ### - res <- MotifPeeker:::bpapply(x, test_func, workers = 2) - expect_equal(unlist(res), x) - - ### SnowParam ### - res <- MotifPeeker:::bpapply(x, test_func, workers = 1, - force_snowparam = TRUE, progressbar = FALSE) + res <- MotifPeeker:::bpapply(x, test_func) expect_equal(unlist(res), x) ### bpmapply ### res <- MotifPeeker:::bpapply(x, test_func, - apply_fun = BiocParallel::bpmapply, workers = 2, + apply_fun = BiocParallel::bpmapply, MoreArgs = list(arg2 = y), progressbar = FALSE) expect_equal(res[1,2], 3) }) diff --git a/tests/testthat/test-denovo_motif_funcs.R b/tests/testthat/test-denovo_motif_funcs.R 
index 3e5add4..1babd39 100644 --- a/tests/testthat/test-denovo_motif_funcs.R +++ b/tests/testthat/test-denovo_motif_funcs.R @@ -14,7 +14,6 @@ test_that("De-novo motif enrichment functions works", { denovo_motifs = 2, filter_n = 6, out_dir = tempdir(), - workers = 1, verbose = FALSE, debug = FALSE)) @@ -29,14 +28,13 @@ test_that("De-novo motif enrichment functions works", { motif_db <- get_JASPARCORE() res2 <- find_motifs(res, motif_db = motif_db, - workers = 1, verbose = TRUE, debug = TRUE) expect_length(res2, 4) expect_equal(res2[[1]][[1]]$motif[[1]]@alphabet, "DNA") ## motif_similarity ### - res3 <- motif_similarity(res, workers = 1) + res3 <- motif_similarity(res) expect_true(all(vapply(res3, is.matrix, logical(1)))) ### plot_motif_comparison ### diff --git a/tests/testthat/test-enrichment_funcs.R b/tests/testthat/test-enrichment_funcs.R index e98b360..55c2384 100644 --- a/tests/testthat/test-enrichment_funcs.R +++ b/tests/testthat/test-enrichment_funcs.R @@ -22,8 +22,7 @@ test_that("enrichment plotting and datatable functions works", { genome_build <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 enrichment_df <- get_df_enrichment( - input, segregated_input, motifs, genome_build, reference_index = 1, - workers = 1 + input, segregated_input, motifs, genome_build, reference_index = 1 ) label_colours <- c("red", "cyan") diff --git a/tests/testthat/test-get_df_distances.R b/tests/testthat/test-get_df_distances.R index b90b46e..6615432 100644 --- a/tests/testthat/test-get_df_distances.R +++ b/tests/testthat/test-get_df_distances.R @@ -17,7 +17,7 @@ test_that("get_df_distances works", { ) genome_build <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 - distances_df <- get_df_distances(input, motifs, genome_build, workers = 1, + distances_df <- get_df_distances(input, motifs, genome_build, verbose = FALSE) expect_true(is.data.frame(distances_df)) diff --git a/vignettes/MotifPeeker.Rmd b/vignettes/MotifPeeker.Rmd index 710c2af..9653e58 100644 --- 
a/vignettes/MotifPeeker.Rmd +++ b/vignettes/MotifPeeker.Rmd @@ -201,7 +201,7 @@ if (MotifPeeker:::confirm_meme_install(continue = TRUE)) { motif_db = NULL, # Use default motif database (JASPAR) download_buttons = TRUE, out_dir = tempdir(), # Save output in a temporary directory - workers = 2, # Use two CPU cores on a 16GB RAM machine + BPPARAM = BiocParallel::SerialParam(), # Run serially on a single CPU core debug = FALSE, quiet = TRUE, verbose = TRUE @@ -289,10 +289,10 @@ To make computation faster, we highly recommend tuning the following arguments:
Details -- `workers`: Running motif discovery in parallel can significantly reduce - runtime, but it is very memory-intensive, consuming upwards of 10GB of RAM - per thread. Memory starvation can greatly slow the process, so set `workers` - with caution. +- `BPPARAM = MulticoreParam(x)`: Running motif discovery in parallel can + significantly reduce runtime, but it is very memory-intensive, consuming + upwards of 10GB of RAM per thread. Memory starvation can greatly slow the + process, so set workers (x) with caution. - `denovo_motifs`: The number of motifs to discover per sequence group exponentially increases runtime. We recommend no more than 5 motifs to make a meaningful inference. diff --git a/vignettes/troubleshooting.Rmd b/vignettes/troubleshooting.Rmd index cb3f8d6..391cb10 100644 --- a/vignettes/troubleshooting.Rmd +++ b/vignettes/troubleshooting.Rmd @@ -37,7 +37,7 @@ If you encounter an issue that is not covered, please open an issue on the 1. **Function takes too long to run** It is likely de-novo motif discovery is what is taking too long to run. Try - reducing the number of `workers` if you are running out of memory while + reducing the number of workers if you are running out of memory while running the `MotifPeeker()` function. Additionally, follow the [runtime guidance](https://neurogenomics.github.io/MotifPeeker/articles/MotifPeeker.html#runtime) for `MotifPeeker()`.