Correct terminology (de-novo motif discovery -> motif discovery)

neurogenomics · Nov 12, 2024 · c1f0142 · c1f0142
1 parent 5736475
commit c1f0142
Show file tree

Hide file tree

Showing 20 changed files with 138 additions and 126 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: MotifPeeker
 Title: Benchmarking Epigenomic Profiling Methods Using Motif Enrichment
-Version: 0.99.10
+Version: 0.99.11
 Authors@R: c( 
     person(given = "Hiranyamaya",
            family = "Dash",

diff --git a/NAMESPACE b/NAMESPACE
@@ -73,6 +73,7 @@ importFrom(universalmotif,read_meme)
 importFrom(universalmotif,read_transfac)
 importFrom(universalmotif,read_uniprobe)
 importFrom(utils,capture.output)
+importFrom(utils,packageVersion)
 importFrom(utils,read.table)
 importFrom(utils,write.table)
 importFrom(viridis,scale_color_viridis)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,12 @@
+# MotifPeeker 0.99.11
+
+## Miscellaneous
+
+* Correct "de-novo motif discovery" term to "motif discovery". STREME does not
+perform de-novo motif discovery.
+* Add package version to report header.
+
+
 # MotifPeeker 0.99.9 / 0.99.10
 
 ## Bug Fixes

diff --git a/R/MotifPeeker.R b/R/MotifPeeker.R
@@ -4,24 +4,24 @@
 #' as the key metric. The output is an easy-to-interpret HTML document with the
 #' results. The report contains three main sections: (1) General Metrics on peak
 #' and alignment files (if provided), (2) Known Motif Enrichment Analysis and
-#' (3) De-novo Motif Enrichment Analysis.
+#' (3) Discovered Motif Enrichment Analysis.
 #' 
 #' Runtime guidance: For 4 datasets, the runtime is approximately 3 minutes with
-#' denovo_motif_discovery disabled. However, de-novo motif discovery can take
+#' motif_discovery disabled. However, motif discovery can take
 #' hours to complete. To make computation faster, we highly recommend tuning the
 #' following arguments:
 #' \describe{
 #'    \item{\code{BPPARAM=MulticoreParam(x)}}{Running motif discovery in
 #'    parallel can significantly reduce runtime, but it is very
 #'    memory-intensive, consuming 10+GB of RAM per thread. Memory starvation can
 #'    greatly slow the process, so set the number of cores with caution.}
-#'    \item{\code{denovo_motifs}}{The number of motifs to discover per sequence
-#'    group exponentially increases runtime. We recommend no more than 5
-#'    motifs to make a meaningful inference.}
-#'    \item{\code{trim_seq_width}}{Trimming sequences before running de-novo
+#'    \item{\code{motif_discovery_count}}{The number of motifs to discover per
+#'    sequence group exponentially increases runtime. We recommend no more than
+#'    5 motifs to make a meaningful inference.}
+#'    \item{\code{trim_seq_width}}{Trimming sequences before running
 #'    motif discovery can significantly reduce the search space. Sequence
 #'    length can exponentially increase runtime. We recommend running the
-#'    script with \code{denovo_motif_discovery = FALSE} and studying the
+#'    script with \code{motif_discovery = FALSE} and studying the
 #'    motif-summit distance distribution under general metrics to find the
 #'    sequence length that captures most motifs. A good starting point is 150
 #'    but it can be reduced further if appropriate.}
@@ -69,8 +69,10 @@
 #' labels.
 #' @param cell_counts An integer vector of experiment cell counts for each peak
 #' file. (optional) Creates additional comparisons based on cell counts.
-#' @param denovo_motif_discovery A logical indicating whether to perform
-#' de-novo motif discovery for the third section of the report. (default = TRUE)
+#' @param motif_discovery A logical indicating whether to perform
+#' motif discovery for the third section of the report. (default = TRUE)
+#' @param motif_discovery_count An integer specifying the number of motifs to
+#' discover. (default = 3) Note that higher values take longer to compute.
 #' @param download_buttons A logical indicating whether to include download
 #' buttons for various files within the HTML report. (default = TRUE)
 #' @param out_dir A character string specifying the directory to save the
@@ -97,7 +99,7 @@
 #'   with \code{BPPARAM = BiocParallel::MulticoreParam()}.
 #' }
 #' \strong{IMPORTANT:} For each worker, please ensure a minimum of 8GB of
-#' memory (RAM) is available as \code{denovo_motif_discovery} is
+#' memory (RAM) is available as \code{motif_discovery} is
 #' memory-intensive.
 #' @param quiet A logical indicating whether to print markdown knit messages.
 #' (default = FALSE)
@@ -124,7 +126,7 @@
 #' 
 #' @return Path to the output directory.
 #' 
-#' @note Running de-novo motif discovery is computationally expensive and can
+#' @note Running motif discovery is computationally expensive and can
 #' require from minutes to hours. \code{motif_discovery_count} can widely affect the
 #' runtime (higher values take longer). Setting \code{trim_seq_width} to a lower
 #' value can also reduce the runtime significantly.
@@ -162,8 +164,8 @@
 #'         motif_files = motifs,
 #'         motif_labels = NULL,
 #'         cell_counts = NULL,
-#'         denovo_motif_discovery = TRUE,
-#'         denovo_motifs = 1,
+#'         motif_discovery = TRUE,
+#'         motif_discovery_count = 1,
 #'         motif_db = NULL,
 #'         download_buttons = TRUE,
 #'         out_dir = tempdir(),
@@ -184,8 +186,8 @@ MotifPeeker <- function(
         motif_files = NULL,
         motif_labels = NULL,
         cell_counts = NULL,
-        denovo_motif_discovery = TRUE,
-        denovo_motifs = 3,
+        motif_discovery = TRUE,
+        motif_discovery_count = 3,
         filter_n = 6,
         trim_seq_width = NULL,
         motif_db = NULL,
@@ -223,9 +225,9 @@ MotifPeeker <- function(
         "equal to ", shQuote("peak_files"), ".")
         stop(stp_msg)
     }
-    if (denovo_motif_discovery &&
-        (is.null(denovo_motifs) || denovo_motifs < 1)) {
-        stp_msg <- "Number of de-novo motifs to find must be greater than 0."
+    if (motif_discovery &&
+        (is.null(motif_discovery_count) || motif_discovery_count < 1)) {
+        stp_msg <- "Number of motifs to discover must be greater than 0."
         stop(stp_msg)
     }
 
@@ -266,8 +268,8 @@ MotifPeeker <- function(
         motif_files = motif_files,
         motif_labels = motif_labels,
         cell_counts = cell_counts,
-        denovo_motif_discovery = denovo_motif_discovery,
-        denovo_motifs = denovo_motifs,
+        motif_discovery = motif_discovery,
+        discover_motifs = motif_discovery_count,
         filter_n = filter_n,
         motif_db = motif_db,
         trim_seq_width = trim_seq_width,

diff --git a/R/denovo_motifs.R b/R/denovo_motifs.R
@@ -1,6 +1,6 @@
-#' Find de-novo motifs in sequences
+#' Discover motifs in sequences
 #' 
-#' Use STREME from MEME suite to find de-novo motifs in the provided sequences.
+#' Use STREME from MEME suite to find motifs in the provided sequences.
 #' To speed up the process, the sequences can be optionally trimmed to reduce
 #' the search space. The result is then optionally filtered to remove motifs
 #' with a high number of nucleotide repeats
@@ -9,19 +9,19 @@
 #' sequences to search for motifs.
 #' @param trim_seq_width An integer specifying the width of the sequence to
 #' extract around the summit (default = NULL). This sequence is used to search
-#' for de novo motifs. If not provided, the entire peak region will be used.
+#' for motifs. If not provided, the entire peak region will be used.
 #' This parameter is intended to reduce the search space and speed up motif
 #' discovery; therefore, a value less than the average peak width is
 #' recommended. Peaks are trimmed symmetrically around the summit while
 #' respecting the peak bounds.
-#' @param denovo_motifs An integer specifying the number of de-novo motifs to
+#' @param discover_motifs_count An integer specifying the number of motifs to
 #' discover. (default = 3) Note that higher values take longer to compute.
 #' @param minw An integer specifying the minimum width of the motif.
 #' (default = 8)
 #' @param maxw An integer specifying the maximum width of the motif.
 #' (default = 25)
 #' @param filter_n An integer specifying the number of consecutive nucleotide
-#' repeats a de-novo discovered motif must contain to be filtered out.
+#' repeats a discovered motif must contain to be filtered out.
 #' (default = 6)
 #' @param out_dir A \code{character} vector of output directory to save STREME
 #' results to. (default = \code{tempdir()})
@@ -47,7 +47,7 @@
 #'     res <- denovo_motifs(list(CTCF_TIP_peaks),
 #'                         trim_seq_width = 50,
 #'                         genome_build = genome_build,
-#'                         denovo_motifs = 1,
+#'                         discover_motifs_count = 1,
 #'                         filter_n = 6,
 #'                         minw = 8,
 #'                         maxw = 8,
@@ -59,7 +59,7 @@
 denovo_motifs <- function(seqs,
                             trim_seq_width,
                             genome_build,
-                            denovo_motifs = 3,
+                            discover_motifs_count = 3,
                             minw = 8,
                             maxw = 25,
                             filter_n = 6,
@@ -93,7 +93,7 @@ denovo_motifs <- function(seqs,
                 silent = !debug,
                 minw = 8,
                 maxw = 25,
-                nmotifs = denovo_motifs,
+                nmotifs = discover_motifs_count,
                 meme_path = meme_path,
                 ...
             )

diff --git a/R/find_motifs.R b/R/find_motifs.R
@@ -31,7 +31,7 @@
 #'         res <- denovo_motifs(list(CTCF_TIP_peaks),
 #'                         trim_seq_width = 50,
 #'                         genome_build = genome_build,
-#'                         denovo_motifs = 1,
+#'                         discover_motifs_count = 1,
 #'                         filter_n = 10,
 #'                         out_dir = tempdir())
 #'         res2 <- find_motifs(res, motif_db = get_JASPARCORE(),

diff --git a/R/motif_similarity.R b/R/motif_similarity.R
@@ -42,7 +42,7 @@
 #'         denovo_motifs <- denovo_motifs(unlist(segregated_peaks),
 #'                             trim_seq_width = 50,
 #'                             genome_build = genome_build,
-#'                             denovo_motifs = 1,
+#'                             discover_motifs_count = 1,
 #'                             filter_n = 6,
 #'                             maxw = 8,
 #'                             minw = 8,

diff --git a/README.Rmd b/README.Rmd
@@ -34,8 +34,8 @@ peaks, including FRiP scores, peak widths, and motif-to-summit distances.
 enriched user-supplied motifs in the datasets and compares them between the
 common and unique peaks from comparison and reference datasets.  
 
-3. **De-Novo Motif Enrichment Analysis**: Details the statistics of de-novo
-discovered motifs in common and unique peaks from comparison and reference
+3. **Discovered Motif Enrichment Analysis**: Details the statistics of
+motifs discovered in common and unique peaks from comparison and reference
 datasets. Examines motif similarities and identifies the closest known motifs in
 the JASPAR or the provided database.
 
@@ -126,8 +126,8 @@ MotifPeeker(
     genome_build = "hg38",
     motif_files = motif_files,
     cell_counts = NULL,  # No cell-count information
-    denovo_motif_discovery = TRUE,
-    denovo_motifs = 3,
+    motif_discovery = TRUE,
+    motif_discovery_count = 3,
     motif_db = NULL,
     download_buttons = TRUE,
     out_dir = tempdir(),
@@ -192,7 +192,7 @@ enhance them:
 - `cell_counts`: An integer vector of experiment cell counts for each peak file
   (if available). Creates additional comparisons based on cell counts.  
 - `motif_db`: Path to `.meme` format file to use as reference database, or a
-  list of `universalmotif-class` objects. Results from de-novo motif discovery
+  list of `universalmotif-class` objects. Results from motif discovery
   are searched against this database to find similar motifs. If not provided,
   JASPAR CORE database will be used, making this parameter **truly optional**.
   **NOTE**: p-value estimates are inaccurate when the database has fewer than
@@ -208,7 +208,7 @@ for [`MotifPeeker()`](https://neurogenomics.github.io/MotifPeeker/reference/Moti
 ### Runtime Guidance
 
 For 4 datasets, the runtime is approximately 3 minutes with
-denovo_motif_discovery disabled. However, de-novo motif discovery can take
+motif_discovery disabled. However, motif discovery can take
 hours to complete.  
 
 To make computation faster, we highly recommend tuning the following arguments:  
@@ -219,13 +219,13 @@ To make computation faster, we highly recommend tuning the following arguments:
   runtime, but it is very memory-intensive, consuming upwards of 10GB of RAM per
   thread. Memory starvation can greatly slow the process, so set `workers` with
   caution.  
-- `denovo_motifs`: The number of motifs to discover per sequence group
+- `motif_discovery_count`: The number of motifs to discover per sequence group
   exponentially increases runtime. We recommend no more than 5 motifs to make a
   meaningful inference.  
-- `trim_seq_width`: Trimming sequences before running de-novo motif discovery
+- `trim_seq_width`: Trimming sequences before running motif discovery
   can significantly reduce the search space. Sequence length can exponentially
   increase runtime. We recommend running the script with
-  `denovo_motif_discovery = FALSE` and studying the motif-summit distance
+  `motif_discovery = FALSE` and studying the motif-summit distance
   distribution under general metrics to find the sequence length that captures
   most motifs. A good starting point is 150 but it can be reduced further if
   appropriate.

diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ style="height: 300px !important;" />
 
 [![License: GPL (\>=
 3)](https://img.shields.io/badge/license-GPL%20(%3E=%203)-blue.svg)](https://cran.r-project.org/web/licenses/GPL%20(%3E=%203))
-[![](https://img.shields.io/badge/devel%20version-0.99.10-black.svg)](https://github.com/neurogenomics/MotifPeeker)
+[![](https://img.shields.io/badge/devel%20version-0.99.11-black.svg)](https://github.com/neurogenomics/MotifPeeker)
 [![](https://img.shields.io/github/languages/code-size/neurogenomics/MotifPeeker.svg)](https://github.com/neurogenomics/MotifPeeker)
 [![](https://img.shields.io/github/last-commit/neurogenomics/MotifPeeker.svg)](https://github.com/neurogenomics/MotifPeeker/commits/master)
 <br> [![R build
@@ -18,7 +18,7 @@ status](https://github.com/neurogenomics/MotifPeeker/workflows/rworkflows/badge.
 
 **Authors:** ***Hiranyamaya (Hiru) Dash, Thomas Roberts, Nathan
 Skene***  
-**Updated:** ***Nov-11-2024***
+**Updated:** ***Nov-12-2024***
 
 ## Introduction
 
@@ -35,10 +35,10 @@ package outputs an HTML report consisting of three sections:
     compares them between the common and unique peaks from comparison
     and reference datasets.
 
-3.  **De-Novo Motif Enrichment Analysis**: Details the statistics of
-    de-novo discovered motifs in common and unique peaks from comparison
-    and reference datasets. Examines motif similarities and identifies
-    the closest known motifs in the JASPAR or the provided database.
+3.  **Discovered Motif Enrichment Analysis**: Details the statistics of
+    motifs discovered in common and unique peaks from comparison and
+    reference datasets. Examines motif similarities and identifies the
+    closest known motifs in the JASPAR or the provided database.
 
 <!-- If you use `MotifPeeker`, please cite:  -->
 
@@ -141,8 +141,8 @@ MotifPeeker(
     genome_build = "hg38",
     motif_files = motif_files,
     cell_counts = NULL,  # No cell-count information
-    denovo_motif_discovery = TRUE,
-    denovo_motifs = 3,
+    motif_discovery = TRUE,
+    motif_discovery_count = 3,
     motif_db = NULL,
     download_buttons = TRUE,
     out_dir = tempdir(),
@@ -222,10 +222,10 @@ or enhance them:
   peak file (if available). Creates additional comparisons based on cell
   counts.  
 - `motif_db`: Path to `.meme` format file to use as reference database,
-  or a list of `universalmotif-class` objects. Results from de-novo
-  motif discovery are searched against this database to find similar
-  motifs. If not provided, JASPAR CORE database will be used, making
-  this parameter **truly optional**. **NOTE**: p-value estimates are
+  or a list of `universalmotif-class` objects. Results from motif
+  discovery are searched against this database to find similar motifs.
+  If not provided, JASPAR CORE database will be used, making this
+  parameter **truly optional**. **NOTE**: p-value estimates are
   inaccurate when the database has fewer than 50 entries.
 
 </details>
@@ -239,8 +239,8 @@ documentation for
 ### Runtime Guidance
 
 For 4 datasets, the runtime is approximately 3 minutes with
-denovo_motif_discovery disabled. However, de-novo motif discovery can
-take hours to complete.
+motif_discovery disabled. However, motif discovery can take hours to
+complete.
 
 To make computation faster, we highly recommend tuning the following
 arguments:
@@ -256,16 +256,16 @@ arguments:
   reduce runtime, but it is very memory-intensive, consuming upwards of
   10GB of RAM per thread. Memory starvation can greatly slow the
   process, so set `workers` with caution.  
-- `denovo_motifs`: The number of motifs to discover per sequence group
-  exponentially increases runtime. We recommend no more than 5 motifs to
-  make a meaningful inference.  
-- `trim_seq_width`: Trimming sequences before running de-novo motif
-  discovery can significantly reduce the search space. Sequence length
-  can exponentially increase runtime. We recommend running the script
-  with `denovo_motif_discovery = FALSE` and studying the motif-summit
-  distance distribution under general metrics to find the sequence
-  length that captures most motifs. A good starting point is 150 but it
-  can be reduced further if appropriate.
+- `motif_discovery_count`: The number of motifs to discover per sequence
+  group exponentially increases runtime. We recommend no more than 5
+  motifs to make a meaningful inference.  
+- `trim_seq_width`: Trimming sequences before running motif discovery
+  can significantly reduce the search space. Sequence length can
+  exponentially increase runtime. We recommend running the script with
+  `motif_discovery = FALSE` and studying the motif-summit distance
+  distribution under general metrics to find the sequence length that
+  captures most motifs. A good starting point is 150 but it can be
+  reduced further if appropriate.
 
 </details>