diff --git a/packages/rOpenScPCA/DESCRIPTION b/packages/rOpenScPCA/DESCRIPTION index 5e3d6d565..3dcfee9de 100644 --- a/packages/rOpenScPCA/DESCRIPTION +++ b/packages/rOpenScPCA/DESCRIPTION @@ -26,7 +26,10 @@ Suggests: Config/testthat/edition: 3 RoxygenNote: 7.3.2 Imports: + BiocParallel, bluster (>= 1.14), dplyr, methods, - SingleCellExperiment + purrr, + SingleCellExperiment, + tidyr diff --git a/packages/rOpenScPCA/NAMESPACE b/packages/rOpenScPCA/NAMESPACE index 64b528a71..f08543285 100644 --- a/packages/rOpenScPCA/NAMESPACE +++ b/packages/rOpenScPCA/NAMESPACE @@ -2,5 +2,6 @@ export(calculate_clusters) export(extract_pc_matrix) +export(sweep_clusters) import(SingleCellExperiment) import(methods) diff --git a/packages/rOpenScPCA/R/calculate-clusters.R b/packages/rOpenScPCA/R/calculate-clusters.R index 3232fbf71..e395048f7 100644 --- a/packages/rOpenScPCA/R/calculate-clusters.R +++ b/packages/rOpenScPCA/R/calculate-clusters.R @@ -20,6 +20,7 @@ #' @param cluster_args List of additional arguments to pass to the chosen clustering function. #' Only single values for each argument are supported (no vectors or lists). #' See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest +#' @param threads Number of threads to use. Default is 1. #' @param seed Random seed to set for clustering. #' @param pc_name Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment #' or Seurat object is provided. If not provided, the SingleCellExperiment object name will default to "PCA" and the @@ -36,6 +37,9 @@ #' # cluster PCs from a SingleCellExperiment object using default parameters #' cluster_df <- calculate_clusters(sce_object) #' +#' # cluster PCs from a SingleCellExperiment object using default parameters and 4 threads +#' cluster_df <- calculate_clusters(sce_object, threads = 4) +#' #' # cluster PCs from a Seurat object using default parameters #' cluster_df <- calculate_clusters(seurat_object) #' @@ -60,6 +64,7 @@ calculate_clusters <- function( resolution = 1, # louvain or leiden objective_function = c("CPM", "modularity"), # leiden only cluster_args = list(), + threads = 1, seed = NULL, pc_name = NULL) { if (!is.null(seed)) { @@ -81,7 +86,8 @@ calculate_clusters <- function( # Check input arguments stopifnot( "`resolution` must be numeric" = is.numeric(resolution), - "`nn` must be numeric" = is.numeric(nn) + "`nn` must be numeric" = is.numeric(nn), + "`threads` must be numeric" = is.numeric(threads) ) algorithm <- match.arg(algorithm) @@ -104,6 +110,12 @@ calculate_clusters <- function( cluster_args$objective_function <- objective_function } + if (threads > 1) { + bp_param <- BiocParallel::MulticoreParam(threads) + } else { + bp_param <- BiocParallel::SerialParam() + } + # Perform clustering clusters <- bluster::clusterRows( @@ -112,11 +124,11 @@ calculate_clusters <- function( k = nn, type = weighting, cluster.fun = algorithm, - cluster.args = cluster_args + cluster.args = cluster_args, + BPPARAM = bp_param ) ) - # Transform results into a table and return cluster_df <- data.frame( cell_id = rownames(pca_matrix), @@ -124,10 +136,15 @@ calculate_clusters <- function( algorithm = algorithm, weighting = weighting, nn = nn - ) |> - dplyr::bind_cols( - data.frame(cluster_args) - ) + ) + + # Add in cluster_args if it has parameters to include + if (length(cluster_args) != 0) { + cluster_df <- cluster_df |> + dplyr::bind_cols( + data.frame(cluster_args) + ) + } return(cluster_df) } diff --git a/packages/rOpenScPCA/R/sweep-clusters.R b/packages/rOpenScPCA/R/sweep-clusters.R new file mode 100644 index 000000000..92d13e5ac --- /dev/null +++ b/packages/rOpenScPCA/R/sweep-clusters.R @@ -0,0 +1,126 @@ +#' Calculate clusters across a set of parameters +#' +#' This function can be used to perform reproducible clustering while varying a set of parameters. +#' Multiple values can be provided for any of: +#' - The algorithm (`algorithm`) +#' - The weighting scheme (`weighting`) +#' - Number of nearest neighrbors (`nn`) +#' - The resolution parameter (`resolution`) +#' - The objective function parameter (`objective_function`) +#' +#' For each algorithm specified, all parameters possible to use with that +#' algorithm will be systematically varied. This function does not accept additional +#' parameters besides those listed above. +#' Note that defaults for some arguments may differ from the bluster::NNGraphParam() defaults. +#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard" +#' to align with common practice in scRNA-seq analysis. +#' +#' @param x An object containing PCs that clustering can be performed in. This can be either +#' a SingleCellExperiment object, a Seurat object, or a matrix where columns are PCs and +#' rows are cells. If a matrix is provided, it must have row names of cell ids (e.g., barcodes). +#' @param algorithm Clustering algorithm to use. Must be one of "louvain" (default), "walktrap", +#' or "leiden". +#' @param weighting Weighting scheme(s) to consider when sweeping parameters. +#' Provide a vector of unique values to vary this parameter. Options include "jaccard" (default), +#' "rank", or "number" +#' @param nn Number of nearest neighbors to consider when sweeping parameters. +#' Provide a vector of unique values to vary this parameter. Default is 10. +#' @param resolution Resolution parameter used by louvain and leiden clustering only. +#' Provide a vector of unique values to vary this parameter. Default is 1. +#' @param objective_function Leiden-specific parameter for whether to use the +#' Constant Potts Model ("CPM"; default) or "modularity". Provide a vector of unique values +#' to vary this parameter. +#' @param seed Random seed to set for clustering. +#' @param threads Number of threads to use. Default is 1. +#' @param pc_name Name of principal components slot in provided object. This argument is only used +#' if a SingleCellExperiment or Seurat object is provided. If not provided, the SingleCellExperiment +#' object name will default to "PCA" and the Seurat object name will default to "pca". +#' +#' @return A list of data frames from performing clustering across all parameter combinations. +#' Columns include `cluster_set` (identifier column for results from a single clustering run), +#' `cell_id`, and `cluster`. Additional columns represent algorithm parameters and include at least: +#' `algorithm`, `weighting`, and `nn`. Louvain and leiden clustering will also include `resolution`, +#' and leiden clustering will further include `objective_function`. +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' # perform louvain clustering with jaccard weighting (defaults), +#' # varying the nearest neighobor parameter. +#' cluster_df <- sweep_clusters(sce_object, nn = c(10, 15, 20, 25)) +#' +#' # perform louvain clustering, with jaccard and rank weighting, and +#' # varying the nearest neighbor and resolution parameters. +#' cluster_df <- sweep_clusters( +#' sce_object, +#' algorithm = "louvain", +#' weighting = c("jaccard", "rank"), +#' nn = c(10, 15, 20, 25), +#' resolution = c(0.5, 1) +#' ) +#' +#' # perform walktrap and louvain clustering with jaccard weighting, and +#' # varying the nearest neighbors for both algorithms, and resolution for louvain. +#' cluster_df <- sweep_clusters( +#' sce_object, +#' algorithm = c("walktrap", "louvain"), +#' weighting = "jaccard", +#' nn = c(10, 15, 20, 25), +#' resolution = c(0.5, 1) +#' ) +#' } +sweep_clusters <- function( + x, + algorithm = "louvain", + weighting = "jaccard", + nn = 10, + resolution = 1, # louvain or leiden + objective_function = "CPM", # leiden only + threads = 1, + seed = NULL, + pc_name = NULL) { + # Ensure input is a matrix for slightly faster processing later + if (any(class(x) %in% c("matrix", "Matrix"))) { + stopifnot( + "The matrix must have row names representing cell ids, e.g. barcodes." = is.character(rownames(x)) + ) + } else if (is(x, "SingleCellExperiment") || is(x, "Seurat")) { + x <- extract_pc_matrix(x, pc_name = pc_name) + } else { + stop("The first argument should be one of: a SingleCellExperiment object, a Seurat object, or a matrix with row names.") + } + + # Collect all specific inputs into a single list + sweep_params <- tidyr::expand_grid( + algorithm = unique(algorithm), + weighting = unique(weighting), + nn = unique(nn), + resolution = unique(resolution), + objective_function = unique(objective_function) + ) |> + # set unused parameters for each algorithm to default; this will allow duplicates to be removed by distinct() + dplyr::mutate( + resolution = ifelse(algorithm %in% c("louvain", "leiden"), resolution, 1), + objective_function = ifelse(algorithm == "leiden", objective_function, "CPM") + ) |> + dplyr::distinct() + + sweep_results <- sweep_params |> + purrr::pmap( + \(algorithm, weighting, nn, resolution, objective_function) { + calculate_clusters( + x, + algorithm = algorithm, + weighting = weighting, + nn = nn, + resolution = resolution, + objective_function = objective_function, + threads = threads, + seed = seed + ) + } + ) + + return(sweep_results) +} diff --git a/packages/rOpenScPCA/man/.gitkeep b/packages/rOpenScPCA/man/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/packages/rOpenScPCA/man/calculate_clusters.Rd b/packages/rOpenScPCA/man/calculate_clusters.Rd index c85f40ac5..71745dbe4 100644 --- a/packages/rOpenScPCA/man/calculate_clusters.Rd +++ b/packages/rOpenScPCA/man/calculate_clusters.Rd @@ -12,6 +12,7 @@ calculate_clusters( resolution = 1, objective_function = c("CPM", "modularity"), cluster_args = list(), + threads = 1, seed = NULL, pc_name = NULL ) @@ -35,6 +36,8 @@ have row names of cell ids (e.g., barcodes).} Only single values for each argument are supported (no vectors or lists). See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest} +\item{threads}{Number of threads to use. Default is 1.} + \item{seed}{Random seed to set for clustering.} \item{pc_name}{Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment @@ -59,6 +62,9 @@ to align with common practice in scRNA-seq analysis. # cluster PCs from a SingleCellExperiment object using default parameters cluster_df <- calculate_clusters(sce_object) +# cluster PCs from a SingleCellExperiment object using default parameters and 4 threads +cluster_df <- calculate_clusters(sce_object, threads = 4) + # cluster PCs from a Seurat object using default parameters cluster_df <- calculate_clusters(seurat_object) diff --git a/packages/rOpenScPCA/man/sweep_clusters.Rd b/packages/rOpenScPCA/man/sweep_clusters.Rd new file mode 100644 index 000000000..aa598e4b6 --- /dev/null +++ b/packages/rOpenScPCA/man/sweep_clusters.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sweep-clusters.R +\name{sweep_clusters} +\alias{sweep_clusters} +\title{Calculate clusters across a set of parameters} +\usage{ +sweep_clusters( + x, + algorithm = "louvain", + weighting = "jaccard", + nn = 10, + resolution = 1, + objective_function = "CPM", + threads = 1, + seed = NULL, + pc_name = NULL +) +} +\arguments{ +\item{x}{An object containing PCs that clustering can be performed in. This can be either +a SingleCellExperiment object, a Seurat object, or a matrix where columns are PCs and +rows are cells. If a matrix is provided, it must have row names of cell ids (e.g., barcodes).} + +\item{algorithm}{Clustering algorithm to use. Must be one of "louvain" (default), "walktrap", +or "leiden".} + +\item{weighting}{Weighting scheme(s) to consider when sweeping parameters. +Provide a vector of unique values to vary this parameter. Options include "jaccard" (default), + "rank", or "number"} + +\item{nn}{Number of nearest neighbors to consider when sweeping parameters. +Provide a vector of unique values to vary this parameter. Default is 10.} + +\item{resolution}{Resolution parameter used by louvain and leiden clustering only. +Provide a vector of unique values to vary this parameter. Default is 1.} + +\item{objective_function}{Leiden-specific parameter for whether to use the +Constant Potts Model ("CPM"; default) or "modularity". Provide a vector of unique values +to vary this parameter.} + +\item{threads}{Number of threads to use. Default is 1.} + +\item{seed}{Random seed to set for clustering.} + +\item{pc_name}{Name of principal components slot in provided object. This argument is only used +if a SingleCellExperiment or Seurat object is provided. If not provided, the SingleCellExperiment +object name will default to "PCA" and the Seurat object name will default to "pca".} +} +\value{ +A list of data frames from performing clustering across all parameter combinations. + Columns include `cluster_set` (identifier column for results from a single clustering run), + `cell_id`, and `cluster`. Additional columns represent algorithm parameters and include at least: + `algorithm`, `weighting`, and `nn`. Louvain and leiden clustering will also include `resolution`, + and leiden clustering will further include `objective_function`. +} +\description{ +This function can be used to perform reproducible clustering while varying a set of parameters. +Multiple values can be provided for any of: + - The algorithm (`algorithm`) + - The weighting scheme (`weighting`) + - Number of nearest neighrbors (`nn`) + - The resolution parameter (`resolution`) + - The objective function parameter (`objective_function`) +} +\details{ +For each algorithm specified, all parameters possible to use with that +algorithm will be systematically varied. This function does not accept additional +parameters besides those listed above. +Note that defaults for some arguments may differ from the bluster::NNGraphParam() defaults. +Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard" +to align with common practice in scRNA-seq analysis. +} +\examples{ +\dontrun{ +# perform louvain clustering with jaccard weighting (defaults), +# varying the nearest neighobor parameter. +cluster_df <- sweep_clusters(sce_object, nn = c(10, 15, 20, 25)) + +# perform louvain clustering, with jaccard and rank weighting, and +# varying the nearest neighbor and resolution parameters. +cluster_df <- sweep_clusters( + sce_object, + algorithm = "louvain", + weighting = c("jaccard", "rank"), + nn = c(10, 15, 20, 25), + resolution = c(0.5, 1) +) + +# perform walktrap and louvain clustering with jaccard weighting, and +# varying the nearest neighbors for both algorithms, and resolution for louvain. +cluster_df <- sweep_clusters( + sce_object, + algorithm = c("walktrap", "louvain"), + weighting = "jaccard", + nn = c(10, 15, 20, 25), + resolution = c(0.5, 1) +) +} +} diff --git a/packages/rOpenScPCA/tests/testthat/test-calculate-clusters.R b/packages/rOpenScPCA/tests/testthat/test-calculate-clusters.R index aa2dcb45b..9512666ad 100644 --- a/packages/rOpenScPCA/tests/testthat/test-calculate-clusters.R +++ b/packages/rOpenScPCA/tests/testthat/test-calculate-clusters.R @@ -21,32 +21,12 @@ test_that("calculate_clusters runs with a matrix, defaults", { colnames(cluster_df), c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution") ) - expect_equal( - cluster_df$cell_id, - rownames(test_mat) - ) - - expect_s3_class( - cluster_df$cluster, - "factor" - ) - - expect_equal( - unique(cluster_df$algorithm), - "louvain" - ) - expect_equal( - unique(cluster_df$weighting), - "jaccard" - ) - expect_equal( - unique(cluster_df$nn), - 10 - ) - expect_equal( - unique(cluster_df$resolution), - 1 - ) + expect_equal(cluster_df$cell_id, rownames(test_mat)) + expect_s3_class(cluster_df$cluster, "factor") + expect_equal(unique(cluster_df$algorithm), "louvain") + expect_equal(unique(cluster_df$weighting), "jaccard") + expect_equal(unique(cluster_df$nn), 10) + expect_equal(unique(cluster_df$resolution), 1) }) @@ -61,32 +41,39 @@ test_that("calculate_clusters runs with additional cluster_args", { colnames(cluster_df), c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution", "objective_function", "n_iterations") ) - expect_equal( - unique(cluster_df$n_iterations), - 3 + expect_equal(unique(cluster_df$n_iterations), 3) +}) + + + +test_that("calculate_clusters runs when cluster_args is empty", { + cluster_df <- calculate_clusters( + test_mat, + algorithm = "walktrap" + ) + + expect_setequal( + colnames(cluster_df), + c("cell_id", "cluster", "algorithm", "weighting", "nn") ) + expect_equal(unique(cluster_df$algorithm), "walktrap") }) + test_that("calculate_clusters runs with an object, defaults", { cluster_df_sce <- calculate_clusters(sce) expect_setequal( colnames(cluster_df_sce), c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution") ) - expect_equal( - cluster_df_sce$cell_id, - rownames(test_mat) - ) + expect_equal(cluster_df_sce$cell_id, rownames(test_mat)) cluster_df_srat <- calculate_clusters(srat) expect_setequal( colnames(cluster_df_srat), c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution") ) - expect_equal( - cluster_df_srat$cell_id, - rownames(test_mat) - ) + expect_equal(cluster_df_srat$cell_id, rownames(test_mat)) }) @@ -119,10 +106,7 @@ test_that("extract_pc_matrix works as expected", { pc_mat_srt <- extract_pc_matrix(srat) # update test_mat column names to match what will have Seurat changed them to colnames(test_mat) <- gsub("^PC", "PC_", colnames(test_mat)) - expect_identical( - pc_mat_srt, - test_mat - ) + expect_identical(pc_mat_srt, test_mat) }) test_that("extract_pc_matrix errors as expected", { diff --git a/packages/rOpenScPCA/tests/testthat/test-sweep-clusters.R b/packages/rOpenScPCA/tests/testthat/test-sweep-clusters.R new file mode 100644 index 000000000..837aac9ec --- /dev/null +++ b/packages/rOpenScPCA/tests/testthat/test-sweep-clusters.R @@ -0,0 +1,129 @@ +suppressPackageStartupMessages(library(SingleCellExperiment)) + +set.seed(2024) +sce <- splatter::simpleSimulate(nGenes = 1000, verbose = FALSE) |> + scater::logNormCounts() |> + scater::runPCA(ncomponents = 10) + +test_mat <- reducedDim(sce, "PCA") + +srat <- Seurat::CreateSeuratObject(counts = counts(sce), assay = "RNA") +srat[["pca"]] <- Seurat::CreateDimReducObject( + embeddings = test_mat, + key = "PC_", # underscore avoids Seurat warning that it's adding an underscore + assay = "RNA" +) + + +test_that("sweep_clusters works as expected with default algorithm & weighting", { + sweep_list <- sweep_clusters( + sce, + nn = c(10, 15), + resolution = c(0.5, 1) + ) + + expect_length(sweep_list, 4) + + sweep_list |> + purrr::walk( + \(df) { + expect_setequal( + colnames(df), + c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution") + ) + + # these tests confirm the defaults went through + expect_equal(unique(df$algorithm), "louvain") + expect_equal(unique(df$weighting), "jaccard") + + expect_true( + all(df$nn == 10) || all(df$nn == 15) + ) + expect_true( + all(df$resolution == 0.5) || all(df$resolution == 1) + ) + } + ) +}) + + + +test_that("sweep_clusters works as expected with matrix input", { + sweep_list <- sweep_clusters(test_mat) + expect_length(sweep_list, 1) +}) + + +test_that("sweep_clusters works as expected with Seurat input", { + sweep_list <- sweep_clusters(srat) + expect_length(sweep_list, 1) +}) + + + +test_that("sweep_clusters works as expected with non-default algorithm", { + sweep_list <- sweep_clusters( + sce, + algorithm = "leiden", + objective_function = "modularity", + resolution = c(0.5, 1) + ) + + sweep_list |> + purrr::walk( + \(df) { + expect_setequal( + colnames(df), + c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution", "objective_function") + ) + + expect_equal(unique(df$algorithm), "leiden") + expect_equal(unique(df$objective_function), "modularity") + + expect_true( + all(df$resolution == 0.5) || all(df$resolution == 1) + ) + } + ) +}) + + + + +test_that("sweep_clusters works as expected with multiple algorithms", { + sweep_list <- sweep_clusters( + sce, + algorithm = c("walktrap", "louvain"), + # used by both + nn = c(10, 15), + # only used by louvain + resolution = c(0.5, 1) + ) + + # count algorithms + alg_counts <- sweep_list |> + purrr::map(\(df) unique(df$algorithm)) |> + purrr::reduce(c) + expect_length(alg_counts, 6) + expect_equal(sum(alg_counts == "louvain"), 4) + expect_equal(sum(alg_counts == "walktrap"), 2) + + + + sweep_list |> + purrr::walk( + \(df) { + if (unique(df$algorithm) == "walktrap") { + expect_setequal( + colnames(df), + c("cell_id", "cluster", "algorithm", "weighting", "nn") + ) + } else if (unique(df$algorithm) == "louvain") { + expect_setequal( + colnames(df), + c("cell_id", "cluster", "algorithm", "weighting", "nn", "resolution") + ) + } + } + ) +})