diff --git a/bin/QC.ipynb b/bin/QC.ipynb index f376902..6845427 100644 --- a/bin/QC.ipynb +++ b/bin/QC.ipynb @@ -208,7 +208,7 @@ "metadata": {}, "outputs": [], "source": [ - "if not plots == \"\": \n", + "if plots is not None: \n", " plots = pd.read_csv(plots, header=None)\n", " plots = plots.iloc[0,:].values.tolist()\n", " for plot in plots: \n", @@ -361,9 +361,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:postprocessing]", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "postprocessing" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -375,7 +375,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/bin/aggregation.py b/bin/aggregation.py index b565ea8..e90b858 100644 --- a/bin/aggregation.py +++ b/bin/aggregation.py @@ -1,7 +1,6 @@ import argparse import scanpy as sc import numpy as np -from utils import get_basename_without_extension parser = argparse.ArgumentParser(description="wrapper for concatenating the samples.") parser.add_argument( @@ -16,9 +15,6 @@ for idx, h5_file_path in enumerate(args.inputs, start=1): print(h5_file_path) adata = sc.read(h5_file_path) - adata.obs["sample_name"] = get_basename_without_extension(h5_file_path).rsplit( - "_", 1 - )[0] adata.obs.index = adata.obs["sample_name"] + "_" + adata.obs.index adata.var_names_make_unique() adata_list.append(adata) @@ -30,25 +26,4 @@ permuted_indices = np.random.permutation(combined_adata.n_obs) combined_adata = combined_adata[permuted_indices, :] -# Compute QC metrics useful for downstream analysis -combined_adata.var["mito"] = combined_adata.var_names.str.upper().str.startswith( - ("MT-") -) -combined_adata.var["ribo"] = combined_adata.var_names.str.upper().str.startswith( - ("RPS", "RPL") -) - -combined_adata.obs["mito_frac"] = np.sum( - combined_adata[:, combined_adata.var["mito"]].X, axis=1 -) / np.sum(combined_adata.X, axis=1) - -sc.pp.calculate_qc_metrics(combined_adata, percent_top=[20], inplace=True) -sc.pp.calculate_qc_metrics( - combined_adata, - qc_vars=["ribo", "mito"], - percent_top=None, - log1p=False, - inplace=True, -) - combined_adata.write_h5ad(args.output) diff --git a/bin/doublet_detection.py b/bin/doublet_detection.py index d649923..26d2981 100644 --- a/bin/doublet_detection.py +++ b/bin/doublet_detection.py @@ -3,7 +3,7 @@ import argparse import scanpy as sc import doubletdetection -from utils import anndata_from_h5 +from utils import anndata_from_h5, get_basename_without_extension os.environ["NUMBA_CACHE_DIR"] = "/tmp/" @@ -98,4 +98,7 @@ adata_batch.obs["doublet"] = doublets.astype(bool) adata_batch.obs["doublet_score"] = doublet_score +adata_batch.obs["sample_name"] = get_basename_without_extension(args.input_h5).rsplit( + "_", 1 +)[0] adata_batch.write_h5ad(args.output_h5ad) diff --git a/bin/outlier_filter.py b/bin/outlier_filter.py index f42c517..5d67e07 100644 --- a/bin/outlier_filter.py +++ b/bin/outlier_filter.py @@ -142,6 +142,23 @@ def designate_outliers( # Load data adata = sc.read_h5ad(args.input_h5ad) +# Compute QC metrics useful for downstream analysis +adata.var["mito"] = adata.var_names.str.upper().str.startswith(("MT-")) +adata.var["ribo"] = adata.var_names.str.upper().str.startswith(("RPS", "RPL")) + +adata.obs["mito_frac"] = np.sum(adata[:, adata.var["mito"]].X, axis=1) / np.sum( + adata.X, axis=1 +) + +sc.pp.calculate_qc_metrics(adata, percent_top=[20], inplace=True) +sc.pp.calculate_qc_metrics( + adata, + qc_vars=["ribo", "mito"], + percent_top=None, + log1p=False, + inplace=True, +) + calculate_group_featcount_dist(adata, group_key=args.sample_col) # Mark outliers diff --git a/bin/run_celltypist.py b/bin/run_celltypist.py index 9feae14..d6c5838 100644 --- a/bin/run_celltypist.py +++ b/bin/run_celltypist.py @@ -54,11 +54,14 @@ # Check if the model path exists, if not if os.path.exists(args.model): # Load the model + print(f"Found model at {args.models}") model = models.Model.load(model=args.model) print(f"Loaded {args.model}") else: - # Download and download the model + # Download the model + print("Downloading the model...") models.download_models(model=args.model) + model = args.model print(f"Downloaded {args.model} at {models.models_path}") # prediction diff --git a/main.nf b/main.nf index 96655d7..518a205 100644 --- a/main.nf +++ b/main.nf @@ -25,34 +25,43 @@ workflow { def raw_path = file(row[1]) def filtered_path = row[2] def demultiplexing = row[3].toLowerCase() - return tuple(name, raw_path, filtered_path, demultiplexing) + def expected_droplets = row[4] + return tuple(name, raw_path, filtered_path, demultiplexing, expected_droplets) } // run Cellbender CELLBENDER(ch_input) - // run DoubletDetection with optional demultiplexing + // run DoubletDetection (Optional: demultiplexing) DOUBLETDETECTION(CELLBENDER.out.name, CELLBENDER.out.raw_h5, CELLBENDER.out.filtered_path, CELLBENDER.out.demultiplexing) - // aggregate the outputs - AGGREGATION(DOUBLETDETECTION.out.doublet_h5ad.collect()) + // Optional: aggregate the outputs + if (params.aggregation) { + AGGREGATION(DOUBLETDETECTION.out.doublet_h5ad.collect()) + outlier_input_name = AGGREGATION.out.name + outlier_input = AGGREGATION.out.aggregation_h5ad + } else { + outlier_input_name = DOUBLETDETECTION.out.name + outlier_input = DOUBLETDETECTION.out.doublet_h5ad + } // filter out outliers - OUTLIER_FILTER(AGGREGATION.out.aggregation_h5ad) + OUTLIER_FILTER(outlier_input_name, outlier_input) // SCRAN normalization - SCRAN(OUTLIER_FILTER.out.outlier_filtered_h5ad) + SCRAN(OUTLIER_FILTER.out.name, OUTLIER_FILTER.out.outlier_filtered_h5ad) // SCVI batch correction - SCVI(SCRAN.out.scran_h5ad) + SCVI(SCRAN.out.name, SCRAN.out.scran_h5ad) // Postprocessing - POSTPROCESSING(SCVI.out.scvi_h5ad) + POSTPROCESSING(SCVI.out.name, SCVI.out.scvi_h5ad) // Celltypist - CELLTYPIST(POSTPROCESSING.out.postprocessing_scvi_h5ad) + CELLTYPIST(POSTPROCESSING.out.name, POSTPROCESSING.out.postprocessing_scvi_h5ad) // Report - REPORT(POSTPROCESSING.out.postprocessing_h5ad, - CELLTYPIST.out.celltypist_scvi_h5ad) + REPORT(CELLTYPIST.out.name, + CELLTYPIST.out.postprocessing_h5ad, + CELLTYPIST.out.celltypist_scvi_h5ad) } diff --git a/modules/aggregation/main.nf b/modules/aggregation/main.nf index 03de500..fa62d88 100644 --- a/modules/aggregation/main.nf +++ b/modules/aggregation/main.nf @@ -8,6 +8,7 @@ process AGGREGATION { val doublet_h5ads output: + val "${params.experiment.name}", emit: name path "${params.experiment.name ? params.experiment.name + '_' : ''}aggregation.h5ad", emit: aggregation_h5ad script: diff --git a/modules/cellbender/main.nf b/modules/cellbender/main.nf index 1f32df8..7d8c1b9 100644 --- a/modules/cellbender/main.nf +++ b/modules/cellbender/main.nf @@ -5,7 +5,7 @@ process CELLBENDER { publishDir "${params.outdir}/cellbender/", mode: 'copy' input: - tuple val(name), path(raw_path), val(filtered_path), val(demultiplexing) + tuple val(name), path(raw_path), val(filtered_path), val(demultiplexing), val(expected_droplets) output: val "${name}", emit: name @@ -21,7 +21,7 @@ process CELLBENDER { python ${baseDir}/bin/run_cellbender.py \ ${raw_path} \ ${name}_cellbender.h5 \ - ${params.cellbender.total_droplets_included} \ + ${expected_droplets} \ --filtered ${filtered_path} """ else @@ -29,7 +29,7 @@ process CELLBENDER { python ${baseDir}/bin/run_cellbender.py \ ${raw_path} \ ${name}_cellbender.h5 \ - ${params.cellbender.total_droplets_included} \ + ${expected_droplets} \ --filtered ${filtered_path} """ } diff --git a/modules/celltypist/main.nf b/modules/celltypist/main.nf index 42d8447..5d7e14b 100644 --- a/modules/celltypist/main.nf +++ b/modules/celltypist/main.nf @@ -1,14 +1,17 @@ process CELLTYPIST { label 'gpus' container 'quay.io/teichlab/celltypist:latest' - containerOptions '--nv' + containerOptions '--nv --bind ${params.mount}' publishDir "${params.outdir}/celltypist/", mode: 'copy' input: + val name path postprocessing_scvi_h5ad output: - path "${params.experiment.name}_celltypist_scvi.h5ad", emit: celltypist_scvi_h5ad + val "${name}", emit: name + path "${name}_celltypist_scvi.h5ad", emit: celltypist_scvi_h5ad + path "${postprocessing_scvi_h5ad}", emit: postprocessing_h5ad script: def gpu_index = task.index % params.maxForks @@ -19,7 +22,7 @@ process CELLTYPIST { export MPLCONFIGDIR=\$PWD python ${baseDir}/bin/run_celltypist.py \ ${postprocessing_scvi_h5ad} \ - ${params.experiment.name}_celltypist_scvi.h5ad \ + ${name}_celltypist_scvi.h5ad \ --model ${params.celltypist.model} \ --majority_voting \ --use_gpu \ @@ -31,7 +34,7 @@ process CELLTYPIST { export MPLCONFIGDIR=\$PWD python ${baseDir}/bin/run_celltypist.py \ ${postprocessing_scvi_h5ad} \ - ${params.experiment.name}_celltypist_scvi.h5ad \ + ${name}_celltypist_scvi.h5ad \ --model ${params.celltypist.model} \ --majority_voting \ --use_gpu \ diff --git a/modules/doubletdetection/main.nf b/modules/doubletdetection/main.nf index c204d6f..44002d0 100644 --- a/modules/doubletdetection/main.nf +++ b/modules/doubletdetection/main.nf @@ -11,6 +11,7 @@ process DOUBLETDETECTION { val demultiplexing output: + val "${name}", emit: name path "${name}_doubletdetection.h5ad", emit: doublet_h5ad script: diff --git a/modules/outlierfilter/main.nf b/modules/outlierfilter/main.nf index a6e97bc..ab3e69d 100644 --- a/modules/outlierfilter/main.nf +++ b/modules/outlierfilter/main.nf @@ -4,16 +4,18 @@ process OUTLIER_FILTER { publishDir "${params.outdir}/outlier_filtered/", mode: 'copy' input: + val name path aggregation_h5ad output: - path "${params.experiment.name ? params.experiment.name + '_' : ''}outlier_filtered.h5ad", emit: outlier_filtered_h5ad + val "${name}", emit: name + path "${name}_outlier_filtered.h5ad", emit: outlier_filtered_h5ad script: """ export NUMBA_CACHE_DIR=\$PWD python ${baseDir}/bin/outlier_filter.py \ ${aggregation_h5ad} \ - ${params.experiment.name ? params.experiment.name + '_' : ''}outlier_filtered.h5ad + ${name}_outlier_filtered.h5ad """ } diff --git a/modules/postprocessing/main.nf b/modules/postprocessing/main.nf index 6d11704..01882df 100644 --- a/modules/postprocessing/main.nf +++ b/modules/postprocessing/main.nf @@ -4,23 +4,25 @@ process POSTPROCESSING { publishDir "${params.outdir}/postprocessing/", mode: 'copy' input: + val name path scvi_h5ad output: - path "${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing.h5ad", emit: postprocessing_h5ad - path "${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing_scvi.h5ad", emit: postprocessing_scvi_h5ad + val "${name}", emit: name + path "${name}_postprocessing.h5ad", emit: postprocessing_h5ad + path "${name}_postprocessing_scvi.h5ad", emit: postprocessing_scvi_h5ad script: """ export NUMBA_CACHE_DIR=\$PWD python ${baseDir}/bin/postprocessing.py \ ${scvi_h5ad} \ - ${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing.h5ad \ + ${name}_postprocessing.h5ad \ --n_pca_components ${params.postprocessing.n_pca_components} \ --metadata ${params.postprocessing.metadata} python ${baseDir}/bin/postprocessing.py \ ${scvi_h5ad} \ - ${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing_scvi.h5ad \ + ${name}_postprocessing_scvi.h5ad \ --use_scvi \ --metadata ${params.postprocessing.metadata} """ diff --git a/modules/report/main.nf b/modules/report/main.nf index 466f8de..6ad2f74 100644 --- a/modules/report/main.nf +++ b/modules/report/main.nf @@ -4,18 +4,19 @@ process REPORT { publishDir "${params.outdir}/report/", mode: 'copy' input: + val name path postprocessing_h5ad path celltypist_scvi_h5ad output: - path "${params.experiment.name}_report.ipynb", emit: report_ipynb - path "${params.experiment.name}_report.html", emit: report_html + path "${name}_report.ipynb", emit: report_ipynb + path "${name}_report.html", emit: report_html script: """ export HOME=\$PWD python -m ipykernel install --user --name postprocessing - papermill ${baseDir}/bin/QC.ipynb ${params.experiment.name}_report.ipynb -p plots ${params.report.plots} - jupyter nbconvert --to html ${params.experiment.name}_report.ipynb + papermill ${baseDir}/bin/QC.ipynb ${name}_report.ipynb -p plots ${params.report.plots} + jupyter nbconvert --to html ${name}_report.ipynb """ } diff --git a/modules/scran/main.nf b/modules/scran/main.nf index 41135d7..95f76ff 100644 --- a/modules/scran/main.nf +++ b/modules/scran/main.nf @@ -4,10 +4,12 @@ process SCRAN { publishDir "${params.outdir}/scran/", mode: 'copy' input: + val name file aggregation_h5ad output: - path "${params.experiment.name ? params.experiment.name + '_' : ''}scran.h5ad", emit: scran_h5ad + val "${name}", emit: name + path "${name}_scran.h5ad", emit: scran_h5ad script: // set home directory to cache basilisk """ @@ -15,6 +17,6 @@ process SCRAN { export HOME=\$PWD Rscript ${baseDir}/bin/scran.R \ ${aggregation_h5ad} \ - "${params.experiment.name ? params.experiment.name + '_' : ''}scran.h5ad" + "${name}_scran.h5ad" """ } diff --git a/modules/scvi/main.nf b/modules/scvi/main.nf index 66ba294..d30c1b1 100644 --- a/modules/scvi/main.nf +++ b/modules/scvi/main.nf @@ -5,17 +5,19 @@ process SCVI { publishDir "${params.outdir}/scvi/", mode: 'copy' input: + val name path scran_h5ad output: - path "${params.experiment.name ? params.experiment.name + '_' : ''}scvi.h5ad", emit: scvi_h5ad + val "${name}", emit: name + path "${name}_scvi.h5ad", emit: scvi_h5ad script: """ export NUMBA_CACHE_DIR=\$PWD python ${baseDir}/bin/scvi_norm.py \ ${scran_h5ad} \ - ${params.experiment.name ? params.experiment.name + '_' : ''}scvi.h5ad \ + ${name}_scvi.h5ad \ --n_latent ${params.scvi.n_latent} \ --n_top_genes ${params.scvi.n_top_genes} """ diff --git a/nextflow.config b/nextflow.config index 31025b1..8af5017 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,6 @@ params { samplesheet = "./data/sampleSheet.csv" - experiment.name = "" + experiment.name = "nf-scrnaseq" cellbender.total_droplets_included = 2000 scvi.n_latent = 50 scvi.n_top_genes = 1000 @@ -8,6 +8,7 @@ params { postprocessing.n_diffmap_components = 20 postprocessing.metadata = "" report.plot = "" + mount = "/home" outdir = "results" with_gpu = false max_memory = "20.GB" diff --git a/params.yml b/params.yml index b92f168..23e843e 100644 --- a/params.yml +++ b/params.yml @@ -2,8 +2,7 @@ samplesheet: "./data/sampleSheet.csv" outdir: "./data/results/" experiment: name: "toy_data" -cellbender: - total_droplets_included: 2000 +aggregation: false scvi: n_latent: 10 n_top_genes: 10 @@ -13,6 +12,8 @@ postprocessing: metadata: "None" celltypist: model: "Immune_All_Low.pkl" +report: + plots: "None" mount: "/data1,/home,/scratch" with_gpu: true max_memory: "6.GB" diff --git a/run_test.sh b/run_test.sh index d2c2d25..297c156 100644 --- a/run_test.sh +++ b/run_test.sh @@ -1,12 +1,3 @@ -module load singularity/3.7.1 -module load gcc/12.2.0 -module load cuda/12.0 -#export NXF_SINGULARITY_CACHEDIR="/lila/data/chanjlab/wangm10/work-nf-scrnaseq/singularity/" export NXF_SINGULARITY_CACHEDIR="/usersoftware/chanj3/singularity/" -nextflow run ./main.nf -resume -profile iris -params-file ./params.yml -w "/scratch/chanj3/wangm10/work" -# nextflow run ./main.nf -resume 0b2aab78-3380-47e5-bf3a-d2babb788b9f -profile lilac -params-file ./params.yml -w "../work-cellbender-test/" - -# nextflow run ./main.nf -resume -profile singularity -params-file ../oliver_RPMN_2024/oliver_RPMN_2024_params.yml -w "/lila/data/chanjlab/wangm10/work-nf-scrnaseq/" -#nextflow run ./main.nf -resume 68995d8a-02a8-478c-b01e-d1b67095de9a -profile lilac -params-file ../oliver_RPMN_2024/oliver_RPMN_2024_primary_tumor_params.yml -w "/lila/data/chanjlab/wangm10/work-nf-scrnaseq/" -# nextflow run ./main.nf -resume 5022abf3-61bc-4fe8-ab1f-18232051d5a2 -profile lilac -params-file ../oliver_RPMN_2024/oliver_RPMN_2024_params.yml -w "/lila/data/chanjlab/wangm10/work-nf-scrnaseq/" +nextflow run ./main.nf -profile iris -params-file ./params.yml -w "/scratch/chanj3/wangm10/work"