Merge pull request #765 from sjspielman/755-sweep-clustering

Add function to sweep clustering parameters
AlexsLemonade · Sep 19, 2024 · 6f67c73 · 6f67c73
2 parents 1b4e487 + 64d82b2
commit 6f67c73
Show file tree

Hide file tree

Showing 9 changed files with 414 additions and 49 deletions.
diff --git a/packages/rOpenScPCA/DESCRIPTION b/packages/rOpenScPCA/DESCRIPTION
@@ -26,7 +26,10 @@ Suggests:
 Config/testthat/edition: 3
 RoxygenNote: 7.3.2
 Imports: 
+    BiocParallel,
     bluster (>= 1.14),
     dplyr,
     methods,
-    SingleCellExperiment
+    purrr,
+    SingleCellExperiment,
+    tidyr
diff --git a/packages/rOpenScPCA/NAMESPACE b/packages/rOpenScPCA/NAMESPACE
@@ -2,5 +2,6 @@
 
 export(calculate_clusters)
 export(extract_pc_matrix)
+export(sweep_clusters)
 import(SingleCellExperiment)
 import(methods)
diff --git a/packages/rOpenScPCA/R/calculate-clusters.R b/packages/rOpenScPCA/R/calculate-clusters.R
@@ -20,6 +20,7 @@
 #' @param cluster_args List of additional arguments to pass to the chosen clustering function.
 #'   Only single values for each argument are supported (no vectors or lists).
 #'   See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest
+#' @param threads Number of threads to use. Default is 1.
 #' @param seed Random seed to set for clustering.
 #' @param pc_name Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment
 #'   or Seurat object is provided. If not provided, the SingleCellExperiment object name will default to "PCA" and the
@@ -36,6 +37,9 @@
 #' # cluster PCs from a SingleCellExperiment object using default parameters
 #' cluster_df <- calculate_clusters(sce_object)
 #'
+#' # cluster PCs from a SingleCellExperiment object using default parameters and 4 threads
+#' cluster_df <- calculate_clusters(sce_object, threads = 4)
+#'
 #' # cluster PCs from a Seurat object using default parameters
 #' cluster_df <- calculate_clusters(seurat_object)
 #'
@@ -60,6 +64,7 @@ calculate_clusters <- function(
     resolution = 1, # louvain or leiden
     objective_function = c("CPM", "modularity"), # leiden only
     cluster_args = list(),
+    threads = 1,
     seed = NULL,
     pc_name = NULL) {
   if (!is.null(seed)) {
@@ -81,7 +86,8 @@ calculate_clusters <- function(
   # Check input arguments
   stopifnot(
     "`resolution` must be numeric" = is.numeric(resolution),
-    "`nn` must be numeric" = is.numeric(nn)
+    "`nn` must be numeric" = is.numeric(nn),
+    "`threads` must be numeric" = is.numeric(threads)
   )
 
   algorithm <- match.arg(algorithm)
@@ -104,6 +110,12 @@ calculate_clusters <- function(
     cluster_args$objective_function <- objective_function
   }
 
+  if (threads > 1) {
+    bp_param <- BiocParallel::MulticoreParam(threads)
+  } else {
+    bp_param <- BiocParallel::SerialParam()
+  }
+
 
   # Perform clustering
   clusters <- bluster::clusterRows(
@@ -112,22 +124,27 @@ calculate_clusters <- function(
       k = nn,
       type = weighting,
       cluster.fun = algorithm,
-      cluster.args = cluster_args
+      cluster.args = cluster_args,
+      BPPARAM = bp_param
     )
   )
 
-
   # Transform results into a table and return
   cluster_df <- data.frame(
     cell_id = rownames(pca_matrix),
     cluster = clusters,
     algorithm = algorithm,
     weighting = weighting,
     nn = nn
-  ) |>
-    dplyr::bind_cols(
-      data.frame(cluster_args)
-    )
+  )
+
+  # Add in cluster_args if it has parameters to include
+  if (length(cluster_args) != 0) {
+    cluster_df <- cluster_df |>
+      dplyr::bind_cols(
+        data.frame(cluster_args)
+      )
+  }
 
   return(cluster_df)
 }

diff --git a/packages/rOpenScPCA/R/sweep-clusters.R b/packages/rOpenScPCA/R/sweep-clusters.R
@@ -0,0 +1,126 @@
+#' Calculate clusters across a set of parameters
+#'
+#' This function can be used to perform reproducible clustering while varying a set of parameters.
+#' Multiple values can be provided for any of:
+#'  - The algorithm (`algorithm`)
+#'  - The weighting scheme (`weighting`)
+#'  - Number of nearest neighbors (`nn`)
+#'  - The resolution parameter (`resolution`)
+#'  - The objective function parameter (`objective_function`)
+#'
+#' For each algorithm specified, all parameters possible to use with that
+#' algorithm will be systematically varied. This function does not accept additional
+#' parameters besides those listed above.
+#' Note that defaults for some arguments may differ from the bluster::NNGraphParam() defaults.
+#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard"
+#' to align with common practice in scRNA-seq analysis.
+#'
+#' @param x An object containing PCs that clustering can be performed in. This can be either
+#'   a SingleCellExperiment object, a Seurat object, or a matrix where columns are PCs and
+#'   rows are cells. If a matrix is provided, it must have row names of cell ids (e.g., barcodes).
+#' @param algorithm Clustering algorithm(s) to consider when sweeping parameters.
+#'   Provide a vector of unique values to vary this parameter. Options include "louvain" (default),
+#'   "walktrap", or "leiden".
+#' @param weighting Weighting scheme(s) to consider when sweeping parameters.
+#' Provide a vector of unique values to vary this parameter. Options include "jaccard" (default),
+#'   "rank", or "number"
+#' @param nn Number of nearest neighbors to consider when sweeping parameters.
+#'  Provide a vector of unique values to vary this parameter. Default is 10.
+#' @param resolution Resolution parameter used by louvain and leiden clustering only.
+#'   Provide a vector of unique values to vary this parameter. Default is 1.
+#' @param objective_function Leiden-specific parameter for whether to use the
+#'   Constant Potts Model ("CPM"; default) or "modularity". Provide a vector of unique values
+#'   to vary this parameter.
+#' @param seed Random seed to set for clustering.
+#' @param threads Number of threads to use. Default is 1.
+#' @param pc_name Name of principal components slot in provided object. This argument is only used
+#'   if a SingleCellExperiment or Seurat object is provided. If not provided, the SingleCellExperiment
+#'   object name will default to "PCA" and the Seurat object name will default to "pca".
+#'
+#' @return A list of data frames from performing clustering across all parameter combinations,
+#'   with one data frame for each clustering run (i.e., each unique parameter combination).
+#'   Columns include `cell_id` and `cluster`. Additional columns represent algorithm parameters
+#'   and include at least: `algorithm`, `weighting`, and `nn`. Louvain and leiden clustering will
+#'   also include `resolution`, and leiden clustering will further include `objective_function`.
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # perform louvain clustering with jaccard weighting (defaults),
+#' # varying the nearest neighbor parameter.
+#' cluster_df <- sweep_clusters(sce_object, nn = c(10, 15, 20, 25))
+#'
+#' # perform louvain clustering, with jaccard and rank weighting, and
+#' # varying the nearest neighbor and resolution parameters.
+#' cluster_df <- sweep_clusters(
+#'   sce_object,
+#'   algorithm = "louvain",
+#'   weighting = c("jaccard", "rank"),
+#'   nn = c(10, 15, 20, 25),
+#'   resolution = c(0.5, 1)
+#' )
+#'
+#' # perform walktrap and louvain clustering with jaccard weighting, and
+#' # varying the nearest neighbors for both algorithms, and resolution for louvain.
+#' cluster_df <- sweep_clusters(
+#'   sce_object,
+#'   algorithm = c("walktrap", "louvain"),
+#'   weighting = "jaccard",
+#'   nn = c(10, 15, 20, 25),
+#'   resolution = c(0.5, 1)
+#' )
+#' }
+sweep_clusters <- function(
+    x,
+    algorithm = "louvain",
+    weighting = "jaccard",
+    nn = 10,
+    resolution = 1, # louvain or leiden
+    objective_function = "CPM", # leiden only
+    threads = 1,
+    seed = NULL,
+    pc_name = NULL) {
+  # Resolve the input to a PC matrix once, up front, so that every clustering
+  # run below is handed a matrix rather than re-extracting PCs each time
+  if (any(class(x) %in% c("matrix", "Matrix"))) {
+    stopifnot(
+      "The matrix must have row names representing cell ids, e.g. barcodes." = is.character(rownames(x))
+    )
+  } else {
+    if (!is(x, "SingleCellExperiment") && !is(x, "Seurat")) {
+      stop("The first argument should be one of: a SingleCellExperiment object, a Seurat object, or a matrix with row names.")
+    }
+    x <- extract_pc_matrix(x, pc_name = pc_name)
+  }
+
+  # Build the full grid of parameter combinations, one row per combination
+  param_grid <- tidyr::expand_grid(
+    algorithm = unique(algorithm),
+    weighting = unique(weighting),
+    nn = unique(nn),
+    resolution = unique(resolution),
+    objective_function = unique(objective_function)
+  )
+
+  # Parameters that a given algorithm does not use are reset to their defaults,
+  # which makes otherwise-equivalent rows identical...
+  param_grid <- dplyr::mutate(
+    param_grid,
+    resolution = ifelse(algorithm %in% c("louvain", "leiden"), resolution, 1),
+    objective_function = ifelse(algorithm == "leiden", objective_function, "CPM")
+  )
+  # ...so that the redundant rows can be dropped here
+  param_grid <- dplyr::distinct(param_grid)
+
+  # Worker for a single row of the grid; `x`, `threads`, and `seed` are shared
+  # across all runs and come from the enclosing function
+  run_clustering <- function(algorithm, weighting, nn, resolution, objective_function) {
+    calculate_clusters(
+      x,
+      algorithm = algorithm,
+      weighting = weighting,
+      nn = nn,
+      resolution = resolution,
+      objective_function = objective_function,
+      threads = threads,
+      seed = seed
+    )
+  }
+
+  # One list element (a data frame of cluster assignments) per grid row
+  purrr::pmap(param_grid, run_clustering)
+}
diff --git a/packages/rOpenScPCA/man/.gitkeep b/packages/rOpenScPCA/man/.gitkeep
diff --git a/packages/rOpenScPCA/man/calculate_clusters.Rd b/packages/rOpenScPCA/man/calculate_clusters.Rd
diff --git a/packages/rOpenScPCA/man/sweep_clusters.Rd b/packages/rOpenScPCA/man/sweep_clusters.Rd