Add trim_seqs and additional arguments to MotifPeeker

neurogenomics · Jun 27, 2024 · 126023a · 126023a
1 parent 54c6708
commit 126023a
Show file tree

Hide file tree

Showing 11 changed files with 166 additions and 22 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -48,14 +48,12 @@ Imports:
     tools,
     htmltools,
     rmarkdown,
-    curl,
-    jsonlite,
     viridis,
-    emoji,
     SummarizedExperiment,
     htmlwidgets,
     Rsamtools,
     GenomicAlignments,
+    GenomeInfoDb,
     Biostrings,
     BSgenome,
     memes
@@ -71,7 +69,10 @@ Suggests:
     rworkflows,
     testthat (>= 3.0.0),
     utils,
-    withr
+    withr,
+    emoji,
+    curl,
+    jsonlite
 VignetteBuilder: 
     knitr
 biocViews: Epigenetics, Genetics, QualityControl, ChIPSeq,

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(report_command)
 export(report_header)
 export(summit_to_motif)
 export(to_plotly)
+export(trim_seqs)
 import(ggplot2)
 import(plotly, except = last_plot)
 
@@ -24,19 +25,20 @@ importFrom(BiocFileCache,bfcinfo)
 importFrom(Biostrings,DNAString)
 importFrom(Biostrings,letterFrequency)
 importFrom(DT,datatable)
+importFrom(GenomeInfoDb,seqlengths)
 importFrom(GenomicAlignments,summarizeOverlaps)
+importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,end)
 importFrom(GenomicRanges,mcols)
 importFrom(GenomicRanges,seqnames)
 importFrom(GenomicRanges,start)
+importFrom(GenomicRanges,strand)
 importFrom(GenomicRanges,width)
+importFrom(IRanges,IRanges)
 importFrom(Rsamtools,countBam)
 importFrom(SummarizedExperiment,assay)
-importFrom(curl,curl_fetch_memory)
-importFrom(emoji,emoji)
 importFrom(htmltools,tagList)
 importFrom(htmlwidgets,JS)
-importFrom(jsonlite,fromJSON)
 importFrom(memes,meme_is_installed)
 importFrom(memes,runAme)
 importFrom(memes,runFimo)

diff --git a/R/MotifPeeker.R b/R/MotifPeeker.R
@@ -63,6 +63,13 @@
 #' this database to find similar motifs. If not provided, JASPAR CORE database
 #' will be used. \strong{NOTE}: p-value estimates are inaccurate when the
 #' database has fewer than 50 entries.
+#' @param trim_seq_width An integer specifying the width of the sequence to
+#' extract around the summit (default = NULL). This sequence is used to search
+#' for de novo motifs. If not provided, the entire peak region will be used.
+#' This parameter is intended to reduce the search space and speed up motif
+#' discovery; therefore, a value less than the average peak width is
+#' recommended. Peaks are trimmed symmetrically around the summit while
+#' respecting the peak bounds.
 #' @param download_buttons A logical indicating whether to include download
 #' buttons for various files within the HTML report. (default = TRUE)
 #' @param output_dir A character string specifying the directory to save the
@@ -92,7 +99,6 @@
 #' 
 #' @import ggplot2
 #' @import tidyverse
-#' @importFrom emoji emoji
 #' @importFrom viridis scale_fill_viridis
 #' @importFrom tools file_path_sans_ext
 #' @importFrom rmarkdown render
@@ -103,7 +109,8 @@
 #' 
 #' @note Running de-novo motif discovery is computationally expensive and can
 #' require from minutes to hours. \code{denovo_motifs} can widely affect the
-#' runtime (higher values take longer).
+#' runtime (higher values take longer). Setting \code{trim_seq_width} to a lower
+#' value can also reduce the runtime significantly.
 #' 
 #' @examples
 #' peaks <- list(
@@ -163,6 +170,7 @@ MotifPeeker <- function(
         denovo_motif_discovery = TRUE,
         denovo_motifs = 3,
         motif_db = NULL,
+        trim_seq_width = NULL,
         download_buttons = TRUE,
         meme_path = NULL,
         output_dir = tempdir(),
@@ -222,7 +230,9 @@ MotifPeeker <- function(
         denovo_motif_discovery = denovo_motif_discovery,
         denovo_motifs = denovo_motifs,
         motif_db = motif_db,
+        trim_seq_width = trim_seq_width,
         download_buttons = download_buttons,
+        meme_path = meme_path,
         output_dir = output_dir,
         use_cache = use_cache,
         ncpus = ncpus,

diff --git a/R/check_ENCODE.R b/R/check_ENCODE.R
@@ -12,11 +12,11 @@
 #' @returns A character string specifying the path to the downloaded file. If
 #' the input is not in ENCODE ID format, the input is returned as-is.
 #' 
-#' @importFrom curl curl_fetch_memory
-#' @importFrom jsonlite fromJSON
-#' 
 #' @examples
-#' check_ENCODE("ENCFF109VAD", expect_format = "bam")
+#' if (requireNamespace("curl", quietly = TRUE) &&
+#'     requireNamespace("jsonlite", quietly = TRUE)) {
+#'     check_ENCODE("ENCFF109VAD", expect_format = "bam")
+#' }
 #' 
 #' @export
 check_ENCODE <- function(encode_id, expect_format, verbose = FALSE) {

diff --git a/R/trim_seqs.R b/R/trim_seqs.R
@@ -0,0 +1,59 @@
+#' Trim sequences to a specified width around the summit
+#'
+#' @param peaks A GRanges object created using
+#' \code{\link[=read_peak_file]{read_peak_file()}}.
+#' @param peak_width Total expected width of the peak.
+#' @param genome_build The genome build that the peak sequences should be
+#' derived from.
+#' @param respect_bounds Logical indicating whether the peak width should be
+#' respected when trimming sequences. (default = TRUE) If \code{TRUE}, the
+#' trimmed sequences will not extend beyond the peak boundaries.
+#' 
+#' @importFrom IRanges IRanges
+#' @importFrom GenomicRanges seqnames GRanges strand mcols start end
+#' @importFrom GenomeInfoDb seqlengths
+#' 
+#' @return A GRanges object with the trimmed sequences. The sequences are
+#' guaranteed to not exceed the \code{peak width + 1} (peak width + the summit
+#' base).
+#' 
+#' @examples
+#' data("CTCF_TIP_peaks", package = "MotifPeeker")
+#' peaks <- CTCF_TIP_peaks
+#' genome_build <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
+#' 
+#' trimmed_seqs <- trim_seqs(peaks, peak_width = 100,
+#'                          genome_build = genome_build)
+#' summary(GenomicRanges::width(trimmed_seqs))
+#' 
+#' @export
+trim_seqs <- function(peaks, peak_width, genome_build, respect_bounds = TRUE) {
+    peak_width <- round(peak_width / 2, 0)
+    max_len <- GenomeInfoDb::seqlengths(genome_build)[as.character(
+        GenomicRanges::seqnames(peaks))]
+
+    if (respect_bounds) {
+        adjusted_iranges <- IRanges::IRanges(
+            ## Ensure start is not less than 1 or more than the sequence length
+            start = pmax(peaks$summit - peak_width, 1,
+                        GenomicRanges::start(peaks)),
+            ## Ensure end is not more than the sequence or peak length
+            end = pmin(peaks$summit + peak_width, max_len,
+                        GenomicRanges::end(peaks))
+        )
+    } else {
+        adjusted_iranges <- IRanges::IRanges(
+            start = pmax(peaks$summit - peak_width, 1),
+            end = pmin(peaks$summit + peak_width, max_len)
+        )
+    }
+
+    adjusted_granges <- GenomicRanges::GRanges(
+        seqnames = GenomicRanges::seqnames(peaks),
+        ranges = adjusted_iranges,
+        strand = GenomicRanges::strand(peaks)
+    )
+    GenomicRanges::mcols(adjusted_granges) <- GenomicRanges::mcols(peaks)
+
+    return(adjusted_granges)
+}
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ status](https://github.com/neurogenomics/MotifPeeker/workflows/rworkflows/badge.
 Authors: <i>Hiranyamaya Dash, Thomas Roberts, Nathan Skene</i>  
 </h4>
 <h4>  
-Updated: <i>Jun-26-2024</i>  
+Updated: <i>Jun-27-2024</i>  
 </h4>
 
 # Introduction

diff --git a/inst/markdown/MotifPeeker.Rmd b/inst/markdown/MotifPeeker.Rmd
@@ -33,8 +33,12 @@ params:
         value: 3
     motif_db:
         value: NULL
+    trim_seq_width:
+        value: NULL
     download_buttons:
         value: NULL
+    meme_path:
+        value: NULL
     output_dir:
         value: NULL
     use_cache:
@@ -86,14 +90,12 @@ cellcount_metrics <- ifelse((length(params$cell_counts) == 0), FALSE, TRUE)
 user_motif_metrics <- ifelse((length(user_motifs$motifs) == 0), FALSE, TRUE)
 
 ## Misc
-ex_emo <- emoji::emoji("exclamation")
+ex_emo <- ifelse(requireNamespace("emoji", quietly = TRUE),
+                emoji::emoji("exclamation"), "!!")
 
 ## Check motif_db
-if (params$denovo_motif_discovery && is.null(params$motif_db)) {
-    motif_db <- get_JASPARCORE()
-} else {
-    motif_db <- params$motif_db
-}
+motif_db <- ifelse(params$denovo_motif_discovery && is.null(params$motif_db),
+                    get_JASPARCORE(), params$motif_db)
 
 ### General Metrics ###
 if (alignment_metrics) {
@@ -124,6 +126,8 @@ if (alignment_metrics) {
                     )
 
 ### Known-motif Analysis ###
+comparison_indices <- setdiff(length(result$peaks), params$reference_index)
+
 
 }
 

diff --git a/man/MotifPeeker.Rd b/man/MotifPeeker.Rd
diff --git a/man/check_ENCODE.Rd b/man/check_ENCODE.Rd
diff --git a/man/trim_seqs.Rd b/man/trim_seqs.Rd
diff --git a/tests/testthat/test-trim_seqs.R b/tests/testthat/test-trim_seqs.R
@@ -0,0 +1,16 @@
+test_that("trim_seqs works", {
+    data("CTCF_TIP_peaks", package = "MotifPeeker")
+    peaks <- CTCF_TIP_peaks
+    genome_build <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
+
+    trimmed_seqs <- trim_seqs(peaks, peak_width = 100,
+                                genome_build = genome_build,
+                                respect_bounds = FALSE)
+    expect_true(all(GenomicRanges::width(trimmed_seqs) <= 101))
+
+    trimmed_seqs_bound <- trim_seqs(peaks, peak_width = 1000,
+                                    genome_build = genome_build,
+                                    respect_bounds = TRUE)
+    expect_true(all(GenomicRanges::width(trimmed_seqs_bound) <= 
+                        max(GenomicRanges::width(peaks)) + 1))
+})