-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #72 from AlexsLemonade/sjspielman/68-doublet-module
Add doublet detection module
- Loading branch information
Showing
5 changed files
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
This module performs doublet detection on processed SCE objects with [`scDblFinder`](https://bioconductor.org/packages/release/bioc/html/scDblFinder.html). | ||
|
||
Scripts are derived from the the `doublet-detection` module of the [OpenScPCA-analysis](https://github.com/AlexsLemonade/OpenScPCA-analysis) repository. | ||
|
||
Permalink to the version used: <https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/cdc71e4ac0ca4fcaf6ac225002c928453913f734/analyses/doublet-detection/scripts/01a_run-scdblfinder.R> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/env nextflow | ||
|
||
// Workflow to detect doublets in a SingleCellExperiment object using scDblFinder | ||
|
||
params.doublet_detection_container = 'public.ecr.aws/openscpca/doublet-detection:latest' | ||
|
||
process run_scdblfinder { | ||
container params.doublet_detection_container | ||
tag "${sample_id}" | ||
publishDir "${params.results_bucket}/${params.release_prefix}/doublet-detection/${project_id}/${sample_id}", mode: 'copy' | ||
input: | ||
tuple val(sample_id), | ||
val(project_id), | ||
path(library_files) | ||
output: | ||
tuple val(sample_id), | ||
val(project_id), | ||
path(output_files) | ||
script: | ||
output_files = library_files | ||
.collect{ | ||
it.name.replaceAll(/(?i).rds$/, "_scdblfinder.tsv") | ||
} | ||
""" | ||
for file in ${library_files}; do | ||
run_scdblfinder.R \ | ||
--input_sce_file \$file \ | ||
--output_file \$(basename \${file%.rds}_scdblfinder.tsv) \ | ||
--random_seed 2024 \ | ||
--cores ${task.cpus} | ||
done | ||
""" | ||
|
||
stub: | ||
output_files = library_files | ||
.collect{ | ||
it.name.replaceAll(/(?i).rds$/, "_scdblfinder.tsv") | ||
} | ||
""" | ||
for file in ${library_files}; do | ||
touch \$(basename \${file%.rds}_scdblfinder.tsv) | ||
done | ||
""" | ||
} | ||
|
||
|
||
|
||
workflow detect_doublets { | ||
take: | ||
sample_ch // [sample_id, project_id, sample_path] | ||
main: | ||
// create [sample_id, project_id, [list, of, processed, files]] | ||
libraries_ch = sample_ch | ||
.map{sample_id, project_id, sample_path -> | ||
def library_files = Utils.getLibraryFiles(sample_path, format: "sce", process_level: "processed") | ||
return [sample_id, project_id, library_files] | ||
} | ||
|
||
// detect doublets | ||
run_scdblfinder(libraries_ch) | ||
} |
167 changes: 167 additions & 0 deletions
167
modules/doublet-detection/resources/usr/bin/run_scdblfinder.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# This script runs scDblFinder on an SCE file and exports a TSV file of results. | ||
# If the SCE has fewer than 10 droplets, results will not be calculated and the exported TSV will contain NA values. | ||
|
||
# Load libraries ------ | ||
suppressPackageStartupMessages({ | ||
library(SingleCellExperiment) | ||
library(optparse) | ||
}) | ||
|
||
# Functions ----------- | ||
|
||
#' Run scDblFinder on an SCE object | ||
#' | ||
#' @param sce SCE object to run scDblFinder on | ||
#' @param cores Number of cores to specify to BiocParallel::MulticoreParam() | ||
#' @param sample_var Required for SCE object with multiple samples, the SCE colData variable that indicates sample of origin. | ||
#' This option should _not_ be used for a multiplexed library which has not been demultiplexed, but is suitable for merged objects. | ||
#' @param random_seed Required for SCE object with multiple samples, a random seed to ensure reproducibility when running on multiple samples. | ||
#' @param columns_to_keep Vector of columns in the final scDblFinder table to keep in the returned table. No checking is done on these column names. | ||
#' @param ... Additional arguments to pass to scDblFinder::scDblFinder | ||
#' | ||
#' @return Data frame object with scDblFinder inferences | ||
run_scdblfinder <- function(sce, | ||
cores = 4, | ||
sample_var = NULL, | ||
random_seed = NULL, | ||
columns_to_keep = c("barcodes", "score", "class"), | ||
...) { | ||
# first check multiple samples, which requires a random seed | ||
if (!is.null(sample_var) && is.null(random_seed)) { | ||
if (!sample_var %in% colnames(colData(sce))) { | ||
stop( | ||
glue::glue("The provided sample variable {sample_var} is not present in the input SCE's colData slot.") | ||
) | ||
} | ||
if (is.null(random_seed)) { | ||
# See section 1.5.6: https://bioconductor.org/packages/3.19/bioc/vignettes/scDblFinder/inst/doc/scDblFinder.html#usage | ||
stop("If multiple samples are present in the input SCE object (e.g., the object contains multiple merged libraries), a random seed must be provided for reproducibility.") | ||
} | ||
} | ||
|
||
# set up cores | ||
if (cores == 1) { | ||
bp <- BiocParallel::SerialParam(RNGseed = random_seed) | ||
} else { | ||
bp <- BiocParallel::MulticoreParam(cores, RNGseed = random_seed) | ||
} | ||
|
||
# Run doublet finder | ||
result_df <- scDblFinder::scDblFinder( | ||
sce, | ||
samples = sample_var, # Default is NULL so use whatever was provided by the user or default | ||
BPPARAM = bp, | ||
returnType = "table", # return df, not sce | ||
verbose = FALSE, | ||
... | ||
) | ||
|
||
result_df <- result_df |> | ||
as.data.frame() |> | ||
tibble::rownames_to_column("barcodes") |> | ||
# remove artificial doublets | ||
dplyr::filter(!stringr::str_starts(barcodes, "rDbl")) |> | ||
# keep only columns of interest | ||
dplyr::select(dplyr::all_of(columns_to_keep)) | ||
|
||
return(result_df) | ||
} | ||
|
||
|
||
# Parse options -------- | ||
|
||
option_list <- list( | ||
make_option( | ||
"--input_sce_file", | ||
type = "character", | ||
help = "Path to input SCE file in RDS format." | ||
), | ||
make_option( | ||
"--output_file", | ||
type = "character", | ||
default = "scdblfinder_results.tsv", | ||
help = "Path to output TSV file with doublet inferences." | ||
), | ||
make_option( | ||
"--cores", | ||
type = "integer", | ||
default = 4, | ||
help = "Number of cores to use during scDblFinder inference." | ||
), | ||
make_option( | ||
"--benchmark", | ||
action = "store_true", | ||
default = FALSE, | ||
help = "Whether the script is being run as part of the doublet benchmarking analysis. If this flag is invoked, the `cxds_score` column will be included in the output TSV along with the `barcodes`, `score`, and `class` columns that are always included." | ||
), | ||
make_option( | ||
"--sample_var", | ||
type = "character", | ||
help = "If multiple samples are present in the SCE, the colData column name with sample ids. This option should be used for merged objects." | ||
), | ||
make_option( | ||
"--random_seed", | ||
type = "integer", | ||
default = 2024, | ||
help = "Random seed. Default is 2024." | ||
) | ||
) | ||
opts <- parse_args(OptionParser(option_list = option_list)) | ||
|
||
# Check input arguments and set up files, directories ------ | ||
if (!file.exists(opts$input_sce_file)) { | ||
glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.") | ||
} | ||
if (!stringr::str_ends(opts$output_file, ".tsv")) { | ||
stop("The output TSV file must end in .tsv.") | ||
} | ||
|
||
fs::dir_create(dirname(opts$output_file)) # create output directory as needed | ||
set.seed(opts$random_seed) | ||
|
||
# Detect doublets and export TSV file with inferences ----- | ||
|
||
# Check the number of cells to see if scDblFinder can be run | ||
cell_threshold <- 10 | ||
|
||
sce <- readRDS(opts$input_sce_file) | ||
ncells <- ncol(sce) | ||
|
||
if (ncells < cell_threshold) { | ||
warning( | ||
glue::glue( | ||
"The provided SingleCellExperiment object only has {ncells} droplets, which is fewer than {cell_threshold} so `scDblFinder` will not be run. | ||
An output TSV file will still be produced, but it will be populated with `NA` values." | ||
) | ||
) | ||
na_df <- data.frame( | ||
barcodes = colnames(sce), | ||
score = NA, | ||
class = NA | ||
) | ||
if (opts$benchmark) { | ||
na_df$cxds_score <- NA | ||
} | ||
readr::write_tsv(na_df, opts$output_file) | ||
} else { | ||
keep_columns <- c( | ||
"barcodes", | ||
"score", | ||
"class" | ||
) | ||
if (opts$benchmark) { | ||
keep_columns <- c(keep_columns, "cxds_score") | ||
} | ||
|
||
run_scdblfinder( | ||
sce, | ||
cores = opts$cores, | ||
# only used if there are multiple samples, e.g. if this is processing a merged object | ||
sample_var = opts$sample_var, # will be NULL if not provided as input argument | ||
random_seed = opts$random_seed, | ||
columns_to_keep = keep_columns | ||
) |> | ||
readr::write_tsv(opts$output_file) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters