Skip to content

Commit

Permalink
Merge pull request #72 from AlexsLemonade/sjspielman/68-doublet-module
Browse files Browse the repository at this point in the history
Add doublet detection module
  • Loading branch information
sjspielman authored Jul 23, 2024
2 parents 8e50be6 + f7ca802 commit 846a22b
Show file tree
Hide file tree
Showing 5 changed files with 239 additions and 0 deletions.
5 changes: 5 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
include { example } from './modules/example'
include { simulate_sce } from './modules/simulate-sce'
include { merge_sce } from './modules/merge-sce'
include { detect_doublets } from './modules/doublet-detection'

// **** Parameter checks ****
param_error = false
Expand Down Expand Up @@ -47,5 +48,9 @@ workflow {
sample_ch = Channel.fromList(Utils.getSampleTuples(release_dir))
.filter{ run_all || it[1] in project_ids }

// Run the merge workflow
merge_sce(sample_ch)

// Run the doublet detection workflow
detect_doublets(sample_ch)
}
5 changes: 5 additions & 0 deletions modules/doublet-detection/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This module performs doublet detection on processed SCE objects with [`scDblFinder`](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html).

Scripts are derived from the the `doublet-detection` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository.

Permalink to the version used: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/cdc71e4ac0ca4fcaf6ac225002c928453913f734/analyses/doublet-detection/scripts/01a_run-scdblfinder.R>
61 changes: 61 additions & 0 deletions modules/doublet-detection/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env nextflow

// Workflow to detect doublets in a SingleCellExperiment object using scDblFinder

params.doublet_detection_container = 'public.ecr.aws/openscpca/doublet-detection:latest'

process run_scdblfinder {
container params.doublet_detection_container
tag "${sample_id}"
publishDir "${params.results_bucket}/${params.release_prefix}/doublet-detection/${project_id}/${sample_id}", mode: 'copy'
input:
tuple val(sample_id),
val(project_id),
path(library_files)
output:
tuple val(sample_id),
val(project_id),
path(output_files)
script:
output_files = library_files
.collect{
it.name.replaceAll(/(?i).rds$/, "_scdblfinder.tsv")
}
"""
for file in ${library_files}; do
run_scdblfinder.R \
--input_sce_file \$file \
--output_file \$(basename \${file%.rds}_scdblfinder.tsv) \
--random_seed 2024 \
--cores ${task.cpus}
done
"""

stub:
output_files = library_files
.collect{
it.name.replaceAll(/(?i).rds$/, "_scdblfinder.tsv")
}
"""
for file in ${library_files}; do
touch \$(basename \${file%.rds}_scdblfinder.tsv)
done
"""
}



workflow detect_doublets {
take:
sample_ch // [sample_id, project_id, sample_path]
main:
// create [sample_id, project_id, [list, of, processed, files]]
libraries_ch = sample_ch
.map{sample_id, project_id, sample_path ->
def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed")
return [sample_id, project_id, library_files]
}

// detect doublets
run_scdblfinder(libraries_ch)
}
167 changes: 167 additions & 0 deletions modules/doublet-detection/resources/usr/bin/run_scdblfinder.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env Rscript

# This script runs scDblFinder on an SCE file and exports a TSV file of results.
# If the SCE has fewer than 10 droplets, results will not be calculated and the exported TSV will contain NA values.

# Load libraries ------
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(optparse)
})

# Functions -----------

#' Run scDblFinder on an SCE object
#'
#' @param sce SCE object to run scDblFinder on
#' @param cores Number of cores to specify to BiocParallel::MulticoreParam()
#' @param sample_var Required for SCE object with multiple samples, the SCE colData variable that indicates sample of origin.
#' This option should _not_ be used for a multiplexed library which has not been demultiplexed, but is suitable for merged objects.
#' @param random_seed Required for SCE object with multiple samples, a random seed to ensure reproducibility when running on multiple samples.
#' @param columns_to_keep Vector of columns in the final scDblFinder table to keep in the returned table. No checking is done on these column names.
#' @param ... Additional arguments to pass to scDblFinder::scDblFinder
#'
#' @return Data frame object with scDblFinder inferences
run_scdblfinder <- function(sce,
cores = 4,
sample_var = NULL,
random_seed = NULL,
columns_to_keep = c("barcodes", "score", "class"),
...) {
# first check multiple samples, which requires a random seed
if (!is.null(sample_var) && is.null(random_seed)) {
if (!sample_var %in% colnames(colData(sce))) {
stop(
glue::glue("The provided sample variable {sample_var} is not present in the input SCE's colData slot.")
)
}
if (is.null(random_seed)) {
# See section 1.5.6: https://bioconductor.org/packages/3.19/bioc/vignettes/scDblFinder/inst/doc/scDblFinder.html#usage
stop("If multiple samples are present in the input SCE object (e.g., the object contains multiple merged libraries), a random seed must be provided for reproducibility.")
}
}

# set up cores
if (cores == 1) {
bp <- BiocParallel::SerialParam(RNGseed = random_seed)
} else {
bp <- BiocParallel::MulticoreParam(cores, RNGseed = random_seed)
}

# Run doublet finder
result_df <- scDblFinder::scDblFinder(
sce,
samples = sample_var, # Default is NULL so use whatever was provided by the user or default
BPPARAM = bp,
returnType = "table", # return df, not sce
verbose = FALSE,
...
)

result_df <- result_df |>
as.data.frame() |>
tibble::rownames_to_column("barcodes") |>
# remove artificial doublets
dplyr::filter(!stringr::str_starts(barcodes, "rDbl")) |>
# keep only columns of interest
dplyr::select(dplyr::all_of(columns_to_keep))

return(result_df)
}


# Parse options --------

option_list <- list(
make_option(
"--input_sce_file",
type = "character",
help = "Path to input SCE file in RDS format."
),
make_option(
"--output_file",
type = "character",
default = "scdblfinder_results.tsv",
help = "Path to output TSV file with doublet inferences."
),
make_option(
"--cores",
type = "integer",
default = 4,
help = "Number of cores to use during scDblFinder inference."
),
make_option(
"--benchmark",
action = "store_true",
default = FALSE,
help = "Whether the script is being run as part of the doublet benchmarking analysis. If this flag is invoked, the `cxds_score` column will be included in the output TSV along with the `barcodes`, `score`, and `class` columns that are always included."
),
make_option(
"--sample_var",
type = "character",
help = "If multiple samples are present in the SCE, the colData column name with sample ids. This option should be used for merged objects."
),
make_option(
"--random_seed",
type = "integer",
default = 2024,
help = "Random seed. Default is 2024."
)
)
opts <- parse_args(OptionParser(option_list = option_list))

# Check input arguments and set up files, directories ------
if (!file.exists(opts$input_sce_file)) {
glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.")
}
if (!stringr::str_ends(opts$output_file, ".tsv")) {
stop("The output TSV file must end in .tsv.")
}

fs::dir_create(dirname(opts$output_file)) # create output directory as needed
set.seed(opts$random_seed)

# Detect doublets and export TSV file with inferences -----

# Check the number of cells to see if scDblFinder can be run
cell_threshold <- 10

sce <- readRDS(opts$input_sce_file)
ncells <- ncol(sce)

if (ncells < cell_threshold) {
warning(
glue::glue(
"The provided SingleCellExperiment object only has {ncells} droplets, which is fewer than {cell_threshold} so `scDblFinder` will not be run.
An output TSV file will still be produced, but it will be populated with `NA` values."
)
)
na_df <- data.frame(
barcodes = colnames(sce),
score = NA,
class = NA
)
if (opts$benchmark) {
na_df$cxds_score <- NA
}
readr::write_tsv(na_df, opts$output_file)
} else {
keep_columns <- c(
"barcodes",
"score",
"class"
)
if (opts$benchmark) {
keep_columns <- c(keep_columns, "cxds_score")
}

run_scdblfinder(
sce,
cores = opts$cores,
# only used if there are multiple samples, e.g. if this is processing a merged object
sample_var = opts$sample_var, # will be NULL if not provided as input argument
random_seed = opts$random_seed,
columns_to_keep = keep_columns
) |>
readr::write_tsv(opts$output_file)
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ manifest {
nextflowVersion = '>=24.04.0'
}

nextflow.enable.dsl = 2
nextflow.enable.moduleBinaries = true

// global default parameters for workflows: output buckets are set to staging by default
Expand Down

0 comments on commit 846a22b

Please sign in to comment.