Merge pull request #72 from AlexsLemonade/sjspielman/68-doublet-module

Add doublet detection module
AlexsLemonade · Jul 23, 2024 · 846a22b · 846a22b
2 parents 8e50be6 + f7ca802
commit 846a22b
Show file tree

Hide file tree

Showing 5 changed files with 239 additions and 0 deletions.
diff --git a/main.nf b/main.nf
@@ -4,6 +4,7 @@
 include { example } from './modules/example'
 include { simulate_sce } from './modules/simulate-sce'
 include { merge_sce } from './modules/merge-sce'
+include { detect_doublets } from './modules/doublet-detection'
 
 // **** Parameter checks ****
 param_error = false
@@ -47,5 +48,9 @@ workflow {
   sample_ch = Channel.fromList(Utils.getSampleTuples(release_dir))
     .filter{ run_all || it[1] in project_ids }
 
+  // Run the merge workflow
   merge_sce(sample_ch)
+
+  // Run the doublet detection workflow
+  detect_doublets(sample_ch)
 }
diff --git a/modules/doublet-detection/README.md b/modules/doublet-detection/README.md
@@ -0,0 +1,5 @@
+This module performs doublet detection on processed SCE objects with [`scDblFinder`](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html).
+
+Scripts are derived from the the `doublet-detection` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.
+
+Permalink to the version used: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/cdc71e4ac0ca4fcaf6ac225002c928453913f734/analyses/doublet-detection/scripts/01a_run-scdblfinder.R>
diff --git a/modules/doublet-detection/main.nf b/modules/doublet-detection/main.nf
@@ -0,0 +1,61 @@
+#!/usr/bin/env nextflow
+
+// Workflow to detect doublets in a SingleCellExperiment object using scDblFinder
+
+params.doublet_detection_container = 'public.ecr.aws/openscpca/doublet-detection:latest'
+
+process run_scdblfinder {
+  container params.doublet_detection_container
+  tag "${sample_id}"
+  publishDir "${params.results_bucket}/${params.release_prefix}/doublet-detection/${project_id}/${sample_id}", mode: 'copy'
+  input:
+    tuple val(sample_id),
+          val(project_id),
+          path(library_files)
+  output:
+    tuple val(sample_id),
+          val(project_id),
+          path(output_files)
+  script:
+    output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i).rds$/, "_scdblfinder.tsv")
+      }
+    """
+    for file in ${library_files}; do
+      run_scdblfinder.R \
+        --input_sce_file \$file \
+        --output_file \$(basename \${file%.rds}_scdblfinder.tsv) \
+        --random_seed 2024 \
+        --cores ${task.cpus}
+    done
+    """
+
+  stub:
+    output_files = library_files
+      .collect{
+        it.name.replaceAll(/(?i).rds$/, "_scdblfinder.tsv")
+      }
+    """
+    for file in ${library_files}; do
+      touch \$(basename \${file%.rds}_scdblfinder.tsv)
+    done
+    """
+}
+
+
+
+workflow detect_doublets {
+  take:
+    sample_ch  // [sample_id, project_id, sample_path]
+  main:
+    // create [sample_id, project_id, [list, of, processed, files]]
+    libraries_ch = sample_ch
+      .map{sample_id, project_id, sample_path ->
+        def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed")
+        return [sample_id, project_id, library_files]
+      }
+
+    // detect doublets
+    run_scdblfinder(libraries_ch)
+}
diff --git a/modules/doublet-detection/resources/usr/bin/run_scdblfinder.R b/modules/doublet-detection/resources/usr/bin/run_scdblfinder.R
@@ -0,0 +1,167 @@
+#!/usr/bin/env Rscript
+
+# This script runs scDblFinder on an SCE file and exports a TSV file of results.
+# If the SCE has fewer than 10 droplets, results will not be calculated and the exported TSV will contain NA values.
+
+# Load libraries ------
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+  library(optparse)
+})
+
+# Functions -----------
+
+#' Run scDblFinder on an SCE object
+#'
+#' @param sce SCE object to run scDblFinder on
+#' @param cores Number of cores to specify to BiocParallel::MulticoreParam()
+#' @param sample_var Required for SCE object with multiple samples, the SCE colData variable that indicates sample of origin.
+#'   This option should _not_ be used for a multiplexed library which has not been demultiplexed, but is suitable for merged objects.
+#' @param random_seed Required for SCE object with multiple samples, a random seed to ensure reproducibility when running on multiple samples.
+#' @param columns_to_keep Vector of columns in the final scDblFinder table to keep in the returned table. No checking is done on these column names.
+#' @param ... Additional arguments to pass to scDblFinder::scDblFinder
+#'
+#' @return Data frame object with scDblFinder inferences
+run_scdblfinder <- function(sce,
+                            cores = 4,
+                            sample_var = NULL,
+                            random_seed = NULL,
+                            columns_to_keep = c("barcodes", "score", "class"),
+                            ...) {
+  # first check multiple samples, which requires a random seed
+  if (!is.null(sample_var) && is.null(random_seed)) {
+    if (!sample_var %in% colnames(colData(sce))) {
+      stop(
+        glue::glue("The provided sample variable {sample_var} is not present in the input SCE's colData slot.")
+      )
+    }
+    if (is.null(random_seed)) {
+      # See section 1.5.6: https://bioconductor.org/packages/3.19/bioc/vignettes/scDblFinder/inst/doc/scDblFinder.html#usage
+      stop("If multiple samples are present in the input SCE object (e.g., the object contains multiple merged libraries), a random seed must be provided for reproducibility.")
+    }
+  }
+
+  # set up cores
+  if (cores == 1) {
+    bp <- BiocParallel::SerialParam(RNGseed = random_seed)
+  } else {
+    bp <- BiocParallel::MulticoreParam(cores, RNGseed = random_seed)
+  }
+
+  # Run doublet finder
+  result_df <- scDblFinder::scDblFinder(
+    sce,
+    samples = sample_var, # Default is NULL so use whatever was provided by the user or default
+    BPPARAM = bp,
+    returnType = "table", # return df, not sce
+    verbose = FALSE,
+    ...
+  )
+
+  result_df <- result_df |>
+    as.data.frame() |>
+    tibble::rownames_to_column("barcodes") |>
+    # remove artificial doublets
+    dplyr::filter(!stringr::str_starts(barcodes, "rDbl")) |>
+    # keep only columns of interest
+    dplyr::select(dplyr::all_of(columns_to_keep))
+
+  return(result_df)
+}
+
+
+# Parse options --------
+
+option_list <- list(
+  make_option(
+    "--input_sce_file",
+    type = "character",
+    help = "Path to input SCE file in RDS format."
+  ),
+  make_option(
+    "--output_file",
+    type = "character",
+    default = "scdblfinder_results.tsv",
+    help = "Path to output TSV file with doublet inferences."
+  ),
+  make_option(
+    "--cores",
+    type = "integer",
+    default = 4,
+    help = "Number of cores to use during scDblFinder inference."
+  ),
+  make_option(
+    "--benchmark",
+    action = "store_true",
+    default = FALSE,
+    help = "Whether the script is being run as part of the doublet benchmarking analysis. If this flag is invoked, the `cxds_score` column will be included in the output TSV along with the `barcodes`, `score`, and `class` columns that are always included."
+  ),
+  make_option(
+    "--sample_var",
+    type = "character",
+    help = "If multiple samples are present in the SCE, the colData column name with sample ids. This option should be used for merged objects."
+  ),
+  make_option(
+    "--random_seed",
+    type = "integer",
+    default = 2024,
+    help = "Random seed. Default is 2024."
+  )
+)
+opts <- parse_args(OptionParser(option_list = option_list))
+
+# Check input arguments and set up files, directories ------
+if (!file.exists(opts$input_sce_file)) {
+  glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.")
+}
+if (!stringr::str_ends(opts$output_file, ".tsv")) {
+  stop("The output TSV file must end in .tsv.")
+}
+
+fs::dir_create(dirname(opts$output_file)) # create output directory as needed
+set.seed(opts$random_seed)
+
+# Detect doublets and export TSV file with inferences -----
+
+# Check the number of cells to see if scDblFinder can be run
+cell_threshold <- 10
+
+sce <- readRDS(opts$input_sce_file)
+ncells <- ncol(sce)
+
+if (ncells < cell_threshold) {
+  warning(
+    glue::glue(
+      "The provided SingleCellExperiment object only has {ncells} droplets, which is fewer than {cell_threshold} so `scDblFinder` will not be run.
+       An output TSV file will still be produced, but it will be populated with `NA` values."
+    )
+  )
+  na_df <- data.frame(
+    barcodes = colnames(sce),
+    score = NA,
+    class = NA
+  )
+  if (opts$benchmark) {
+    na_df$cxds_score <- NA
+  }
+  readr::write_tsv(na_df, opts$output_file)
+} else {
+  keep_columns <- c(
+    "barcodes",
+    "score",
+    "class"
+  )
+  if (opts$benchmark) {
+    keep_columns <- c(keep_columns, "cxds_score")
+  }
+
+  run_scdblfinder(
+    sce,
+    cores = opts$cores,
+    # only used if there are multiple samples, e.g. if this is processing a merged object
+    sample_var = opts$sample_var, # will be NULL if not provided as input argument
+    random_seed = opts$random_seed,
+    columns_to_keep = keep_columns
+  ) |>
+    readr::write_tsv(opts$output_file)
+}
diff --git a/nextflow.config b/nextflow.config
@@ -9,6 +9,7 @@ manifest {
   nextflowVersion = '>=24.04.0'
 }
 
+nextflow.enable.dsl = 2
 nextflow.enable.moduleBinaries = true
 
 // global default parameters for workflows: output buckets are set to staging by default