Skip to content

Commit

Permalink
Merge pull request #765 from sjspielman/755-sweep-clustering
Browse files Browse the repository at this point in the history
Add function to sweep clustering parameters
  • Loading branch information
sjspielman authored Sep 19, 2024
2 parents 1b4e487 + 64d82b2 commit 6f67c73
Show file tree
Hide file tree
Showing 9 changed files with 414 additions and 49 deletions.
5 changes: 4 additions & 1 deletion packages/rOpenScPCA/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ Suggests:
Config/testthat/edition: 3
RoxygenNote: 7.3.2
Imports:
BiocParallel,
bluster (>= 1.14),
dplyr,
methods,
SingleCellExperiment
purrr,
SingleCellExperiment,
tidyr
1 change: 1 addition & 0 deletions packages/rOpenScPCA/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

export(calculate_clusters)
export(extract_pc_matrix)
export(sweep_clusters)
import(SingleCellExperiment)
import(methods)
31 changes: 24 additions & 7 deletions packages/rOpenScPCA/R/calculate-clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' @param cluster_args List of additional arguments to pass to the chosen clustering function.
#' Only single values for each argument are supported (no vectors or lists).
#' See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest
#' @param threads Number of threads to use. Default is 1.
#' @param seed Random seed to set for clustering.
#' @param pc_name Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment
#' or Seurat object is provided. If not provided, the SingleCellExperiment object name will default to "PCA" and the
Expand All @@ -36,6 +37,9 @@
#' # cluster PCs from a SingleCellExperiment object using default parameters
#' cluster_df <- calculate_clusters(sce_object)
#'
#' # cluster PCs from a SingleCellExperiment object using default parameters and 4 threads
#' cluster_df <- calculate_clusters(sce_object, threads = 4)
#'
#' # cluster PCs from a Seurat object using default parameters
#' cluster_df <- calculate_clusters(seurat_object)
#'
Expand All @@ -60,6 +64,7 @@ calculate_clusters <- function(
resolution = 1, # louvain or leiden
objective_function = c("CPM", "modularity"), # leiden only
cluster_args = list(),
threads = 1,
seed = NULL,
pc_name = NULL) {
if (!is.null(seed)) {
Expand All @@ -81,7 +86,8 @@ calculate_clusters <- function(
# Check input arguments
stopifnot(
"`resolution` must be numeric" = is.numeric(resolution),
"`nn` must be numeric" = is.numeric(nn)
"`nn` must be numeric" = is.numeric(nn),
"`threads` must be numeric" = is.numeric(threads)
)

algorithm <- match.arg(algorithm)
Expand All @@ -104,6 +110,12 @@ calculate_clusters <- function(
cluster_args$objective_function <- objective_function
}

if (threads > 1) {
bp_param <- BiocParallel::MulticoreParam(threads)
} else {
bp_param <- BiocParallel::SerialParam()
}


# Perform clustering
clusters <- bluster::clusterRows(
Expand All @@ -112,22 +124,27 @@ calculate_clusters <- function(
k = nn,
type = weighting,
cluster.fun = algorithm,
cluster.args = cluster_args
cluster.args = cluster_args,
BPPARAM = bp_param
)
)


# Transform results into a table and return
cluster_df <- data.frame(
cell_id = rownames(pca_matrix),
cluster = clusters,
algorithm = algorithm,
weighting = weighting,
nn = nn
) |>
dplyr::bind_cols(
data.frame(cluster_args)
)
)

# Add in cluster_args if it has parameters to include
if (length(cluster_args) != 0) {
cluster_df <- cluster_df |>
dplyr::bind_cols(
data.frame(cluster_args)
)
}

return(cluster_df)
}
Expand Down
126 changes: 126 additions & 0 deletions packages/rOpenScPCA/R/sweep-clusters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#' Calculate clusters across a set of parameters
#'
#' This function can be used to perform reproducible clustering while varying a set of parameters.
#' Multiple values can be provided for any of:
#' - The algorithm (`algorithm`)
#' - The weighting scheme (`weighting`)
#' - Number of nearest neighbors (`nn`)
#' - The resolution parameter (`resolution`)
#' - The objective function parameter (`objective_function`)
#'
#' For each algorithm specified, all parameters possible to use with that
#' algorithm will be systematically varied. This function does not accept additional
#' parameters besides those listed above.
#' Note that defaults for some arguments may differ from the bluster::NNGraphParam() defaults.
#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard"
#' to align with common practice in scRNA-seq analysis.
#'
#' @param x An object containing PCs that clustering can be performed in. This can be either
#' a SingleCellExperiment object, a Seurat object, or a matrix where columns are PCs and
#' rows are cells. If a matrix is provided, it must have row names of cell ids (e.g., barcodes).
#' @param algorithm Clustering algorithm(s) to consider when sweeping parameters.
#' Provide a vector of unique values to vary this parameter. Options include "louvain" (default),
#' "walktrap", or "leiden".
#' @param weighting Weighting scheme(s) to consider when sweeping parameters.
#' Provide a vector of unique values to vary this parameter. Options include "jaccard" (default),
#' "rank", or "number"
#' @param nn Number of nearest neighbors to consider when sweeping parameters.
#' Provide a vector of unique values to vary this parameter. Default is 10.
#' @param resolution Resolution parameter used by louvain and leiden clustering only.
#' Provide a vector of unique values to vary this parameter. Default is 1.
#' @param objective_function Leiden-specific parameter for whether to use the
#' Constant Potts Model ("CPM"; default) or "modularity". Provide a vector of unique values
#' to vary this parameter.
#' @param seed Random seed to set for clustering.
#' @param threads Number of threads to use. Default is 1.
#' @param pc_name Name of principal components slot in provided object. This argument is only used
#' if a SingleCellExperiment or Seurat object is provided. If not provided, the SingleCellExperiment
#' object name will default to "PCA" and the Seurat object name will default to "pca".
#'
#' @return A list of data frames, one per unique parameter combination, each holding the
#' results of a single clustering run. Columns include `cell_id` and `cluster`.
#' Additional columns represent algorithm parameters and include at least:
#' `algorithm`, `weighting`, and `nn`. Louvain and leiden clustering will also include `resolution`,
#' and leiden clustering will further include `objective_function`.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # perform louvain clustering with jaccard weighting (defaults),
#' # varying the nearest neighbor parameter.
#' cluster_df <- sweep_clusters(sce_object, nn = c(10, 15, 20, 25))
#'
#' # perform louvain clustering, with jaccard and rank weighting, and
#' # varying the nearest neighbor and resolution parameters.
#' cluster_df <- sweep_clusters(
#' sce_object,
#' algorithm = "louvain",
#' weighting = c("jaccard", "rank"),
#' nn = c(10, 15, 20, 25),
#' resolution = c(0.5, 1)
#' )
#'
#' # perform walktrap and louvain clustering with jaccard weighting, and
#' # varying the nearest neighbors for both algorithms, and resolution for louvain.
#' cluster_df <- sweep_clusters(
#' sce_object,
#' algorithm = c("walktrap", "louvain"),
#' weighting = "jaccard",
#' nn = c(10, 15, 20, 25),
#' resolution = c(0.5, 1)
#' )
#' }
sweep_clusters <- function(
    x,
    algorithm = "louvain",
    weighting = "jaccard",
    nn = 10,
    resolution = 1, # louvain or leiden
    objective_function = "CPM", # leiden only
    threads = 1,
    seed = NULL,
    pc_name = NULL) {
  # Resolve the input to a PC matrix once, up front, so that every clustering
  # run below receives the same matrix instead of re-extracting it each time.
  matrix_like <- any(class(x) %in% c("matrix", "Matrix"))
  if (!matrix_like) {
    if (!(is(x, "SingleCellExperiment") || is(x, "Seurat"))) {
      stop("The first argument should be one of: a SingleCellExperiment object, a Seurat object, or a matrix with row names.")
    }
    x <- extract_pc_matrix(x, pc_name = pc_name)
  } else {
    stopifnot(
      "The matrix must have row names representing cell ids, e.g. barcodes." = is.character(rownames(x))
    )
  }

  # Full cross of every unique value supplied for each parameter
  full_grid <- tidyr::expand_grid(
    algorithm = unique(algorithm),
    weighting = unique(weighting),
    nn = unique(nn),
    resolution = unique(resolution),
    objective_function = unique(objective_function)
  )

  # Parameters that an algorithm does not use are pinned to their defaults.
  # Rows that differed only in an unused parameter then become identical and
  # are collapsed by distinct(), so no clustering run is repeated.
  pinned_grid <- dplyr::mutate(
    full_grid,
    resolution = ifelse(algorithm %in% c("louvain", "leiden"), resolution, 1),
    objective_function = ifelse(algorithm == "leiden", objective_function, "CPM")
  )
  param_grid <- dplyr::distinct(pinned_grid)

  # Run one clustering for a single row of the parameter grid; `x`, `threads`
  # and `seed` are shared across all runs.
  cluster_once <- function(algorithm, weighting, nn, resolution, objective_function) {
    calculate_clusters(
      x,
      algorithm = algorithm,
      weighting = weighting,
      nn = nn,
      resolution = resolution,
      objective_function = objective_function,
      threads = threads,
      seed = seed
    )
  }

  # pmap() matches grid columns to `cluster_once` arguments by name and returns
  # one result per row, in grid order.
  purrr::pmap(param_grid, cluster_once)
}
Empty file removed packages/rOpenScPCA/man/.gitkeep
Empty file.
6 changes: 6 additions & 0 deletions packages/rOpenScPCA/man/calculate_clusters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

99 changes: 99 additions & 0 deletions packages/rOpenScPCA/man/sweep_clusters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 6f67c73

Please sign in to comment.