From 453bc37a277cbf7776d7a75daed1c32a27255e2a Mon Sep 17 00:00:00 2001
From: HDash <16350928+HDash@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:47:54 +0100
Subject: [PATCH] Add segregate_seqs and BiocParallel wrappers for parallel
 processing

---
 DESCRIPTION                          |  3 +-
 NAMESPACE                            |  5 ++
 R/MotifPeeker.R                      |  2 +-
 R/bpapply.R                          | 46 +++++++++++++++
 R/get_bpparam.R                      | 45 +++++++++++++++
 R/segregate_seqs.R                   | 58 +++++++++++++++++++
 inst/markdown/MotifPeeker.Rmd        |  4 ++
 man/MotifPeeker.Rd                   |  2 +-
 man/bpapply.Rd                       | 85 ++++++++++++++++++++++++++++
 man/get_bpparam.Rd                   | 41 ++++++++++++++
 man/segregate_seqs.Rd                | 49 ++++++++++++++++
 tests/testthat/test-bpapply.R        | 26 +++++++++
 tests/testthat/test-segregate_seqs.R | 19 +++++++
 13 files changed, 382 insertions(+), 3 deletions(-)
 create mode 100644 R/bpapply.R
 create mode 100644 R/get_bpparam.R
 create mode 100644 R/segregate_seqs.R
 create mode 100644 man/bpapply.Rd
 create mode 100644 man/get_bpparam.Rd
 create mode 100644 man/segregate_seqs.Rd
 create mode 100644 tests/testthat/test-bpapply.R
 create mode 100644 tests/testthat/test-segregate_seqs.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 82a72de..3ddda8b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -56,7 +56,8 @@ Imports:
     GenomeInfoDb,
     Biostrings,
     BSgenome,
-    memes
+    memes,
+    S4Vectors
 Suggests:
     BiocStyle,
     BSgenome.Hsapiens.UCSC.hg19,
diff --git a/NAMESPACE b/NAMESPACE
index c480423..898ef6f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(MotifPeeker)
+export(bpapply)
 export(calc_frip)
 export(check_ENCODE)
 export(check_JASPAR)
@@ -12,6 +13,7 @@ export(read_motif_file)
 export(read_peak_file)
 export(report_command)
 export(report_header)
+export(segregate_seqs)
 export(summit_to_motif)
 export(to_plotly)
 export(trim_seqs)
@@ -29,6 +31,7 @@ importFrom(GenomeInfoDb,seqlengths)
 importFrom(GenomicAlignments,summarizeOverlaps)
 importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,end)
+importFrom(GenomicRanges,findOverlaps)
 importFrom(GenomicRanges,mcols)
 importFrom(GenomicRanges,seqnames)
 importFrom(GenomicRanges,start)
@@ -36,6 +39,8 @@ importFrom(GenomicRanges,strand)
 importFrom(GenomicRanges,width)
 importFrom(IRanges,IRanges)
 importFrom(Rsamtools,countBam)
+importFrom(S4Vectors,queryHits)
+importFrom(S4Vectors,subjectHits)
 importFrom(SummarizedExperiment,assay)
 importFrom(htmltools,tagList)
 importFrom(htmlwidgets,JS)
diff --git a/R/MotifPeeker.R b/R/MotifPeeker.R
index 697df1d..4eea867 100644
--- a/R/MotifPeeker.R
+++ b/R/MotifPeeker.R
@@ -176,7 +176,7 @@ MotifPeeker <- function(
         output_dir = tempdir(),
         display = NULL,
         use_cache = TRUE,
-        ncpus = 1,
+        ncpus = 2,
         quiet = TRUE,
         debug = FALSE,
         verbose = FALSE
diff --git a/R/bpapply.R b/R/bpapply.R
new file mode 100644
index 0000000..937291e
--- /dev/null
+++ b/R/bpapply.R
@@ -0,0 +1,46 @@
+#' Use BiocParallel functions with appropriate parameters
+#' 
+#' Light wrapper around \code{\link[BiocParallel]{BiocParallel}} functions that
+#' automatically sets the appropriate parameters based on the number of workers
+#' specified.
+#' 
+#' @param apply_fun A \code{\link[BiocParallel]{BiocParallel}} function to use
+#' for parallel processing. (default = \code{BiocParallel::bplapply})
+#' @inheritParams BiocParallel::bplapply
+#' @inheritDotParams BiocParallel::bplapply
+#' @inheritDotParams BiocParallel::bpmapply
+#' @inheritParams get_bpparam
+#' 
+#' @returns Output relevant to the \code{apply_fun} specified.
+#' 
+#' @examples
+#' half_it <- function(arg1) return(arg1 / 2)
+#' x <- seq_len(10)
+#' 
+#' res <- bpapply(x, half_it, workers = 2)
+#' print(res)
+#' 
+#' @export
+bpapply <- function(
+        X,
+        FUN,
+        apply_fun = BiocParallel::bplapply,
+        workers = 1,
+        progressbar = workers > 1,
+        force_snowparam = FALSE,
+        verbose = FALSE,
+        ...
+) {
+    stp_msg <- paste("Supplied apply_fun is not a valid BiocParallel function.")
+    apply_fun_package <- attr(apply_fun, "package")
+    if (length(apply_fun_package) == 0 ||
+        apply_fun_package != "BiocParallel")  stopper(stp_msg)
+    
+    BPPARAM <- get_bpparam(workers = workers,
+                            progressbar = progressbar,
+                            force_snowparam = force_snowparam,
+                            verbose = verbose)
+    
+    res <- apply_fun(X, FUN = FUN, BPPARAM = BPPARAM, ...)
+    return(res)
+}
diff --git a/R/get_bpparam.R b/R/get_bpparam.R
new file mode 100644
index 0000000..e957952
--- /dev/null
+++ b/R/get_bpparam.R
@@ -0,0 +1,45 @@
+#' Get parameters for \link[BiocParallel]{BiocParallel}
+#' 
+#' Get appropriate parameters for \code{BiocParallel} based on the
+#' number of workers specified. For less than 4 workers, the function returns a
+#' \code{MulticoreParam} object. For 4 or more cores, the function
+#' returns a \code{SnowParam} object. Since Windows supports
+#' neither, the function returns a \code{SerialParam} object. As a
+#' result, Windows users do not benefit from parallel processing.
+#' 
+#' @param workers The number of workers to use for parallel processing.
+#' @param force_snowparam A logical indicating whether to force the use of
+#' \link[BiocParallel]{SnowParam} object.
+#' @param verbose A logical indicating whether to print verbose messages while
+#' running the function. (default = FALSE)
+#' @inheritParams BiocParallel::SnowParam
+#' 
+#' @returns A \code{BPPARAM} object.
+#'
+#' @seealso \link[BiocParallel]{BiocParallelParam}
+#' 
+#' @keywords internal
+get_bpparam <- function(workers,
+                        progressbar = workers > 1,
+                        force_snowparam = FALSE,
+                        verbose = FALSE) {
+    if (.Platform$OS.type == "windows") {
+        custom_bpparam <- BiocParallel::SerialParam()
+        messager("Windows does not support parallel processing.",
+                "Returning SerialParam object for BiocParallel.",
+                v = verbose)
+    } else if (workers < 4 && !force_snowparam) {
+        custom_bpparam <- 
+            BiocParallel::MulticoreParam(workers = workers,
+                                        progressbar = progressbar)
+        messager("Using MulticoreParam object for BiocParallel (workers =",
+                paste0(workers, ")."), v = verbose)
+    } else {
+        custom_bpparam <- BiocParallel::SnowParam(workers = workers,
+                                                    progressbar = progressbar)
+        messager("Using SnowParam object for BiocParallel (workers =",
+                paste0(workers, ")."), v = verbose)
+    }
+    
+    return(custom_bpparam)
+}
diff --git a/R/segregate_seqs.R b/R/segregate_seqs.R
new file mode 100644
index 0000000..43f866d
--- /dev/null
+++ b/R/segregate_seqs.R
@@ -0,0 +1,58 @@
+#' Segregate input sequences into common and unique groups
+#' 
+#' This function takes two sets of sequences and segregates them into common and
+#' unique sequences. The common sequences are sequences that are present in both
+#' sets of sequences. The unique sequences are sequences that are present in
+#' only one of the sets of sequences.
+#' 
+#' Sequences are considered common if their base pairs align in any
+#' position, even if they vary in length. Consequently, while the number of
+#' common sequences remains consistent between both sets, but the length and
+#' composition of these sequences may differ. As a result, the function returns
+#' distinct sets of common sequences for each input set of sequences.
+#' 
+#' @param seqs1 A set of sequences (\code{GRanges} object)
+#' @param seqs2 A set of sequences (\code{GRanges} object)
+#' 
+#' @importFrom GenomicRanges findOverlaps
+#' @importFrom S4Vectors queryHits subjectHits
+#' 
+#' @returns A list containing the common sequences and unique sequences for each
+#' set of sequences. The list contains the following \code{GRanges} objects:
+#' \itemize{
+#'     \item \code{common_seqs1}: Common sequences in \code{seqs1}
+#'     \item \code{common_seqs2}: Common sequences in \code{seqs2}
+#'     \item \code{unique_seqs1}: Unique sequences in \code{seqs1}
+#'     \item \code{unique_seqs2}: Unique sequences in \code{seqs2}
+#' }
+#' 
+#' @examples
+#' data("CTCF_ChIP_peaks", package = "MotifPeeker")
+#' data("CTCF_TIP_peaks", package = "MotifPeeker")
+#' 
+#' seqs1 <- CTCF_ChIP_peaks
+#' seqs2 <- CTCF_TIP_peaks
+#' res <- segregate_seqs(seqs1, seqs2)
+#' print(res)
+#' 
+#' @seealso \link[GenomicRanges]{findOverlaps}
+#' 
+#' @export
+segregate_seqs <- function(seqs1, seqs2) {
+    common_seqs_ranges <- GenomicRanges::findOverlaps(seqs1, seqs2,
+                                                        type = "any")
+    
+    common_seqs1 <- seqs1[S4Vectors::queryHits(common_seqs_ranges)]
+    common_seqs2 <- seqs2[S4Vectors::subjectHits(common_seqs_ranges)]
+    unique_seqs1 <- seqs1[-S4Vectors::queryHits(common_seqs_ranges)]
+    unique_seqs2 <- seqs2[-S4Vectors::subjectHits(common_seqs_ranges)]
+    
+    return(
+        list(
+            common_seqs1 = common_seqs1,
+            common_seqs2 = common_seqs2,
+            unique_seqs1 = unique_seqs1,
+            unique_seqs2 = unique_seqs2
+        )
+    )
+}
diff --git a/inst/markdown/MotifPeeker.Rmd b/inst/markdown/MotifPeeker.Rmd
index c70a137..6f26b40 100644
--- a/inst/markdown/MotifPeeker.Rmd
+++ b/inst/markdown/MotifPeeker.Rmd
@@ -127,7 +127,11 @@ if (alignment_metrics) {
 
 ### Known-motif Analysis ###
 comparison_indices <- setdiff(length(result$peaks), params$reference_index)
+segregated_peaks <- lapply(comparison_indices, function(x) {
+    segregate_seqs(result$peaks[[params$reference_index]], result$peaks[[x]])
+})
 
+## Calculate enrichment for segregated peaks
 
 }
 
diff --git a/man/MotifPeeker.Rd b/man/MotifPeeker.Rd
index bcbb39e..6e8b785 100644
--- a/man/MotifPeeker.Rd
+++ b/man/MotifPeeker.Rd
@@ -23,7 +23,7 @@ MotifPeeker(
   output_dir = tempdir(),
   display = NULL,
   use_cache = TRUE,
-  ncpus = 1,
+  ncpus = 2,
   quiet = TRUE,
   debug = FALSE,
   verbose = FALSE
diff --git a/man/bpapply.Rd b/man/bpapply.Rd
new file mode 100644
index 0000000..8fb6594
--- /dev/null
+++ b/man/bpapply.Rd
@@ -0,0 +1,85 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bpapply.R
+\name{bpapply}
+\alias{bpapply}
+\title{Use BiocParallel functions with appropriate parameters}
+\usage{
+bpapply(
+  X,
+  FUN,
+  apply_fun = BiocParallel::bplapply,
+  workers = 1,
+  progressbar = workers > 1,
+  force_snowparam = FALSE,
+  verbose = FALSE,
+  ...
+)
+}
+\arguments{
+\item{X}{
+    Any object for which methods \code{length}, \code{[}, and
+    \code{[[} are implemented.
+  }
+
+\item{FUN}{
+    The \code{function} to be applied to each element of \code{X}.
+  }
+
+\item{apply_fun}{A \code{\link[BiocParallel]{BiocParallel}} function to use
+for parallel processing. (default = \code{BiocParallel::bplapply})}
+
+\item{workers}{The number of workers to use for parallel processing.}
+
+\item{progressbar}{
+    \code{logical(1)} Enable progress bar (based on plyr:::progress_text).
+  }
+
+\item{force_snowparam}{A logical indicating whether to force the use of
+\link[BiocParallel]{SnowParam} object.}
+
+\item{verbose}{A logical indicating whether to print verbose messages while
+running the function. (default = FALSE)}
+
+\item{...}{
+  Arguments passed on to \code{\link[BiocParallel:bplapply]{BiocParallel::bplapply}}, \code{\link[BiocParallel:bpmapply]{BiocParallel::bpmapply}}
+  \describe{
+    \item{\code{BPPARAM}}{
+    An optional \code{\link[BiocParallel]{BiocParallelParam}} instance
+    determining the parallel back-end to be used during evaluation, or a
+    \code{list} of \code{BiocParallelParam} instances, to be applied in
+    sequence for nested calls to \pkg{BiocParallel} functions.
+  }
+    \item{\code{BPREDO}}{A \code{list} of output from \code{bplapply} with one or
+    more failed elements. When a list is given in \code{BPREDO},
+    \code{bpok} is used to identify errors, tasks are rerun and inserted
+    into the original results.
+  }
+    \item{\code{BPOPTIONS}}{
+    Additional options to control the behavior of the parallel evaluation, see \code{\link[BiocParallel]{bpoptions}}.
+  }
+    \item{\code{MoreArgs}}{List of additional arguments to \code{FUN}.
+  }
+    \item{\code{SIMPLIFY}}{
+    If \code{TRUE} the result will be simplified using
+    \code{\link{simplify2array}}.
+  }
+    \item{\code{USE.NAMES}}{If \code{TRUE} the result will be named.
+  }
+  }}
+}
+\value{
+Output relevant to the \code{apply_fun} specified.
+}
+\description{
+Light wrapper around \code{\link[BiocParallel]{BiocParallel}} functions that
+automatically sets the appropriate parameters based on the number of workers
+specified.
+}
+\examples{
+half_it <- function(arg1) return(arg1 / 2)
+x <- seq_len(10)
+
+res <- bpapply(x, half_it, workers = 2)
+print(res)
+
+}
diff --git a/man/get_bpparam.Rd b/man/get_bpparam.Rd
new file mode 100644
index 0000000..1e48ece
--- /dev/null
+++ b/man/get_bpparam.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_bpparam.R
+\name{get_bpparam}
+\alias{get_bpparam}
+\title{Get parameters for \link[BiocParallel]{BiocParallel}}
+\usage{
+get_bpparam(
+  workers,
+  progressbar = workers > 1,
+  force_snowparam = FALSE,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{workers}{The number of workers to use for parallel processing.}
+
+\item{progressbar}{
+    \code{logical(1)} Enable progress bar (based on plyr:::progress_text).
+  }
+
+\item{force_snowparam}{A logical indicating whether to force the use of
+\link[BiocParallel]{SnowParam} object.}
+
+\item{verbose}{A logical indicating whether to print verbose messages while
+running the function. (default = FALSE)}
+}
+\value{
+A \code{BPPARAM} object.
+}
+\description{
+Get appropriate parameters for \code{BiocParallel} based on the
+number of workers specified. For less than 4 workers, the function returns a
+\code{MulticoreParam} object. For 4 or more cores, the function
+returns a \code{SnowParam} object. Since Windows supports
+neither, the function returns a \code{SerialParam} object. As a
+result, Windows users do not benefit from parallel processing.
+}
+\seealso{
+\link[BiocParallel]{BiocParallelParam}
+}
+\keyword{internal}
diff --git a/man/segregate_seqs.Rd b/man/segregate_seqs.Rd
new file mode 100644
index 0000000..656d39c
--- /dev/null
+++ b/man/segregate_seqs.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/segregate_seqs.R
+\name{segregate_seqs}
+\alias{segregate_seqs}
+\title{Segregate input sequences into common and unique groups}
+\usage{
+segregate_seqs(seqs1, seqs2)
+}
+\arguments{
+\item{seqs1}{A set of sequences (\code{GRanges} object)}
+
+\item{seqs2}{A set of sequences (\code{GRanges} object)}
+}
+\value{
+A list containing the common sequences and unique sequences for each
+set of sequences. The list contains the following \code{GRanges} objects:
+\itemize{
+    \item \code{common_seqs1}: Common sequences in \code{seqs1}
+    \item \code{common_seqs2}: Common sequences in \code{seqs2}
+    \item \code{unique_seqs1}: Unique sequences in \code{seqs1}
+    \item \code{unique_seqs2}: Unique sequences in \code{seqs2}
+}
+}
+\description{
+This function takes two sets of sequences and segregates them into common and
+unique sequences. The common sequences are sequences that are present in both
+sets of sequences. The unique sequences are sequences that are present in
+only one of the sets of sequences.
+}
+\details{
+Sequences are considered common if their base pairs align in any
+position, even if they vary in length. Consequently, while the number of
+common sequences remains consistent between both sets, but the length and
+composition of these sequences may differ. As a result, the function returns
+distinct sets of common sequences for each input set of sequences.
+}
+\examples{
+data("CTCF_ChIP_peaks", package = "MotifPeeker")
+data("CTCF_TIP_peaks", package = "MotifPeeker")
+
+seqs1 <- CTCF_ChIP_peaks
+seqs2 <- CTCF_TIP_peaks
+res <- segregate_seqs(seqs1, seqs2)
+print(res)
+
+}
+\seealso{
+\link[GenomicRanges]{findOverlaps}
+}
diff --git a/tests/testthat/test-bpapply.R b/tests/testthat/test-bpapply.R
new file mode 100644
index 0000000..8c767e1
--- /dev/null
+++ b/tests/testthat/test-bpapply.R
@@ -0,0 +1,26 @@
+test_that("bpapply works", {
+    test_func <- function(arg1, arg2 = NULL) {
+        if (is.null(arg2)) return(arg1)
+        return(arg1 + arg2)
+    }
+    
+    x <- y <- seq_len(10)
+    
+    ### Non-existent apply_fun ###
+    expect_error(bpapply(x, test_func, apply_fun = "does_not_exist"))
+    
+    ### bplapply ###
+    res <- bpapply(x, test_func, workers = 2)
+    expect_equal(unlist(res), x)
+    
+    ### SnowParam ###
+    res <- bpapply(x, test_func, workers = 1, force_snowparam = TRUE,
+                   progressbar = FALSE)
+    expect_equal(unlist(res), x)
+    
+    ### bpmapply ###
+    res <- bpapply(x, test_func, apply_fun = BiocParallel::bpmapply,
+                    workers = 2, MoreArgs = list(arg2 = y),
+                    progressbar = FALSE)
+    expect_equal(res[1,2], 3)
+})
diff --git a/tests/testthat/test-segregate_seqs.R b/tests/testthat/test-segregate_seqs.R
new file mode 100644
index 0000000..36b103b
--- /dev/null
+++ b/tests/testthat/test-segregate_seqs.R
@@ -0,0 +1,19 @@
+test_that("seggregate_seqs work", {
+    data("CTCF_ChIP_peaks", package = "MotifPeeker")
+    data("CTCF_TIP_peaks", package = "MotifPeeker")
+    
+    seqs1 <- CTCF_ChIP_peaks
+    seqs2 <- CTCF_TIP_peaks
+    res <- segregate_seqs(seqs1, seqs2)
+    common_overlaps <- GenomicRanges::findOverlaps(res$common_seqs1,
+                                                    res$common_seqs2,
+                                                    type = "any")
+    unique_overlaps <- GenomicRanges::findOverlaps(res$unique_seqs1,
+                                                   res$unique_seqs2,
+                                                   type = "any")
+    
+    expect_equal(length(S4Vectors::queryHits(common_overlaps)),
+                 length(S4Vectors::subjectHits(common_overlaps)))
+    expect_equal(length(S4Vectors::queryHits(unique_overlaps)), 0)
+    expect_equal(length(S4Vectors::subjectHits(unique_overlaps)), 0)
+})