Bug correction and allow pipeline run without aggregation #36

Merged
merged 7 commits into from
Aug 26, 2024
Changes from all commits
8 changes: 4 additions & 4 deletions bin/QC.ipynb
@@ -208,7 +208,7 @@
"metadata": {},
"outputs": [],
"source": [
"if not plots == \"\": \n",
"if plots is not None: \n",
" plots = pd.read_csv(plots, header=None)\n",
" plots = plots.iloc[0,:].values.tolist()\n",
" for plot in plots: \n",
@@ -361,9 +361,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:postprocessing]",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "postprocessing"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -375,7 +375,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
"version": "3.12.2"
}
},
"nbformat": 4,
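For context on the corrected guard: comparing against an empty string still passes when the papermill parameter is left at None, and the notebook then crashes on pd.read_csv(None). A minimal sketch of the new behaviour, assuming plots defaults to None and otherwise points at a one-row CSV of column names (both assumptions, since the notebook's parameters cell is not shown in this diff):

    import pandas as pd

    plots = None  # assumed papermill default when no plot list is supplied

    if plots is not None:
        plots = pd.read_csv(plots, header=None)   # one-row CSV, e.g. "doublet_score,mito_frac"
        plots = plots.iloc[0, :].values.tolist()
        for plot in plots:
            print(f"plotting {plot}")             # stand-in for the notebook's plotting calls
    else:
        print("no extra QC plots requested")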
25 changes: 0 additions & 25 deletions bin/aggregation.py
@@ -1,7 +1,6 @@
import argparse
import scanpy as sc
import numpy as np
from utils import get_basename_without_extension

parser = argparse.ArgumentParser(description="wrapper for concatenating the samples.")
parser.add_argument(
@@ -16,9 +15,6 @@
for idx, h5_file_path in enumerate(args.inputs, start=1):
print(h5_file_path)
adata = sc.read(h5_file_path)
adata.obs["sample_name"] = get_basename_without_extension(h5_file_path).rsplit(
"_", 1
)[0]
adata.obs.index = adata.obs["sample_name"] + "_" + adata.obs.index
adata.var_names_make_unique()
adata_list.append(adata)
@@ -30,25 +26,4 @@
permuted_indices = np.random.permutation(combined_adata.n_obs)
combined_adata = combined_adata[permuted_indices, :]

# Compute QC metrics useful for downstream analysis
combined_adata.var["mito"] = combined_adata.var_names.str.upper().str.startswith(
("MT-")
)
combined_adata.var["ribo"] = combined_adata.var_names.str.upper().str.startswith(
("RPS", "RPL")
)

combined_adata.obs["mito_frac"] = np.sum(
combined_adata[:, combined_adata.var["mito"]].X, axis=1
) / np.sum(combined_adata.X, axis=1)

sc.pp.calculate_qc_metrics(combined_adata, percent_top=[20], inplace=True)
sc.pp.calculate_qc_metrics(
combined_adata,
qc_vars=["ribo", "mito"],
percent_top=None,
log1p=False,
inplace=True,
)

combined_adata.write_h5ad(args.output)
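The concatenation call itself sits between the two hunks above and is untouched by this PR; only the per-sample bookkeeping was removed here (it now happens upstream, see doublet_detection.py below). A minimal sketch of the overall aggregation pattern, assuming anndata's standard concat with an outer join (an assumption, since that line is not part of the diff):

    import anndata as ad
    import numpy as np
    import pandas as pd

    # Tiny stand-ins for the per-sample doublet-detection outputs; their obs names
    # are already prefixed with the sample name upstream.
    a = ad.AnnData(np.ones((2, 3)), obs=pd.DataFrame(index=["sampleA_AAAC", "sampleA_AAAG"]))
    b = ad.AnnData(np.ones((2, 3)), obs=pd.DataFrame(index=["sampleB_AAAC", "sampleB_AAAT"]))

    combined = ad.concat([a, b], join="outer")        # assumed join strategy
    permuted = np.random.permutation(combined.n_obs)  # same shuffle as in the script
    combined = combined[permuted, :]
    print(combined.obs_names.tolist())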
5 changes: 4 additions & 1 deletion bin/doublet_detection.py
@@ -3,7 +3,7 @@
import argparse
import scanpy as sc
import doubletdetection
from utils import anndata_from_h5
from utils import anndata_from_h5, get_basename_without_extension

os.environ["NUMBA_CACHE_DIR"] = "/tmp/"

@@ -98,4 +98,7 @@
adata_batch.obs["doublet"] = doublets.astype(bool)
adata_batch.obs["doublet_score"] = doublet_score

adata_batch.obs["sample_name"] = get_basename_without_extension(args.input_h5).rsplit(
"_", 1
)[0]
adata_batch.write_h5ad(args.output_h5ad)
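get_basename_without_extension comes from bin/utils.py, which is not part of this diff; a plausible sketch of that helper and of how the sample name is recovered from a CellBender output file name (the example file name is an assumption):

    import os

    def get_basename_without_extension(path: str) -> str:
        # Assumed implementation: strip the directory and the final extension.
        return os.path.splitext(os.path.basename(path))[0]

    # "sampleA_cellbender.h5" -> "sampleA_cellbender" -> rsplit("_", 1)[0] -> "sampleA"
    sample_name = get_basename_without_extension("/work/sampleA_cellbender.h5").rsplit("_", 1)[0]
    print(sample_name)  # sampleA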
17 changes: 17 additions & 0 deletions bin/outlier_filter.py
@@ -142,6 +142,23 @@ def designate_outliers(
# Load data
adata = sc.read_h5ad(args.input_h5ad)

# Compute QC metrics useful for downstream analysis
adata.var["mito"] = adata.var_names.str.upper().str.startswith(("MT-"))
adata.var["ribo"] = adata.var_names.str.upper().str.startswith(("RPS", "RPL"))

adata.obs["mito_frac"] = np.sum(adata[:, adata.var["mito"]].X, axis=1) / np.sum(
adata.X, axis=1
)

sc.pp.calculate_qc_metrics(adata, percent_top=[20], inplace=True)
sc.pp.calculate_qc_metrics(
adata,
qc_vars=["ribo", "mito"],
percent_top=None,
log1p=False,
inplace=True,
)

calculate_group_featcount_dist(adata, group_key=args.sample_col)

# Mark outliers
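The block moved into outlier_filter.py flags mitochondrial and ribosomal genes by name and computes a per-cell mitochondrial fraction before scanpy's QC metrics are added. A small self-contained illustration on a toy matrix (gene names are made up for the example; the scanpy calls mirror the ones above, except the percent_top=[20] pass, skipped here because the toy has only four genes):

    import numpy as np
    import pandas as pd
    import scanpy as sc
    import anndata as ad

    # Two cells, four genes: two mitochondrial, one ribosomal, one other.
    X = np.array([[5.0, 5.0, 10.0, 80.0],
                  [1.0, 0.0,  9.0, 90.0]])
    adata = ad.AnnData(X, var=pd.DataFrame(index=["MT-CO1", "MT-ND1", "RPS4", "ACTB"]))

    adata.var["mito"] = adata.var_names.str.upper().str.startswith(("MT-"))
    adata.var["ribo"] = adata.var_names.str.upper().str.startswith(("RPS", "RPL"))
    adata.obs["mito_frac"] = np.sum(adata[:, adata.var["mito"]].X, axis=1) / np.sum(adata.X, axis=1)

    sc.pp.calculate_qc_metrics(adata, qc_vars=["ribo", "mito"], percent_top=None, log1p=False, inplace=True)
    print(adata.obs[["mito_frac", "pct_counts_mito", "pct_counts_ribo"]])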
5 changes: 4 additions & 1 deletion bin/run_celltypist.py
@@ -54,11 +54,14 @@
# Check if the model path exists, if not
if os.path.exists(args.model):
# Load the model
print(f"Found model at {args.models}")
model = models.Model.load(model=args.model)
print(f"Loaded {args.model}")
else:
# Download and download the model
# Download the model
print("Downloading the model...")
models.download_models(model=args.model)
model = args.model
print(f"Downloaded {args.model} at {models.models_path}")

# prediction
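For reference, the corrected branch follows CellTypist's documented pattern: load a model from disk if the given value is a path, otherwise download it by name and let annotate resolve the name. A minimal sketch, with the model name assumed for illustration:

    import os
    import celltypist
    from celltypist import models

    model_arg = "Immune_All_Low.pkl"  # assumed model name, for illustration only

    if os.path.exists(model_arg):
        model = models.Model.load(model=model_arg)   # a path on disk: load it directly
    else:
        models.download_models(model=model_arg)      # fetched into models.models_path
        model = model_arg                            # annotate() also accepts a model name

    # Downstream prediction (the module passes --majority_voting and --use_gpu):
    # predictions = celltypist.annotate(adata, model=model, majority_voting=True)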
31 changes: 20 additions & 11 deletions main.nf
@@ -25,34 +25,43 @@ workflow {
def raw_path = file(row[1])
def filtered_path = row[2]
def demultiplexing = row[3].toLowerCase()
return tuple(name, raw_path, filtered_path, demultiplexing)
def expected_droplets = row[4]
return tuple(name, raw_path, filtered_path, demultiplexing, expected_droplets)
}

// run Cellbender
CELLBENDER(ch_input)

// run DoubletDetection with optional demultiplexing
// run DoubletDetection (Optional: demultiplexing)
DOUBLETDETECTION(CELLBENDER.out.name, CELLBENDER.out.raw_h5, CELLBENDER.out.filtered_path, CELLBENDER.out.demultiplexing)

// aggregate the outputs
AGGREGATION(DOUBLETDETECTION.out.doublet_h5ad.collect())
// Optional: aggregate the outputs
if (params.aggregation) {
AGGREGATION(DOUBLETDETECTION.out.doublet_h5ad.collect())
outlier_input_name = AGGREGATION.out.name
outlier_input = AGGREGATION.out.aggregation_h5ad
} else {
outlier_input_name = DOUBLETDETECTION.out.name
outlier_input = DOUBLETDETECTION.out.doublet_h5ad
}

// filter out outliers
OUTLIER_FILTER(AGGREGATION.out.aggregation_h5ad)
OUTLIER_FILTER(outlier_input_name, outlier_input)

// SCRAN normalization
SCRAN(OUTLIER_FILTER.out.outlier_filtered_h5ad)
SCRAN(OUTLIER_FILTER.out.name, OUTLIER_FILTER.out.outlier_filtered_h5ad)

// SCVI batch correction
SCVI(SCRAN.out.scran_h5ad)
SCVI(SCRAN.out.name, SCRAN.out.scran_h5ad)

// Postprocessing
POSTPROCESSING(SCVI.out.scvi_h5ad)
POSTPROCESSING(SCVI.out.name, SCVI.out.scvi_h5ad)

// Celltypist
CELLTYPIST(POSTPROCESSING.out.postprocessing_scvi_h5ad)
CELLTYPIST(POSTPROCESSING.out.name, POSTPROCESSING.out.postprocessing_scvi_h5ad)

// Report
REPORT(POSTPROCESSING.out.postprocessing_h5ad,
CELLTYPIST.out.celltypist_scvi_h5ad)
REPORT(CELLTYPIST.out.name,
CELLTYPIST.out.postprocessing_h5ad,
CELLTYPIST.out.celltypist_scvi_h5ad)
}
1 change: 1 addition & 0 deletions modules/aggregation/main.nf
@@ -8,6 +8,7 @@ process AGGREGATION {
val doublet_h5ads

output:
val "${params.experiment.name}", emit: name
path "${params.experiment.name ? params.experiment.name + '_' : ''}aggregation.h5ad", emit: aggregation_h5ad

script:
6 changes: 3 additions & 3 deletions modules/cellbender/main.nf
@@ -5,7 +5,7 @@ process CELLBENDER {
publishDir "${params.outdir}/cellbender/", mode: 'copy'

input:
tuple val(name), path(raw_path), val(filtered_path), val(demultiplexing)
tuple val(name), path(raw_path), val(filtered_path), val(demultiplexing), val(expected_droplets)

output:
val "${name}", emit: name
@@ -21,15 +21,15 @@
python ${baseDir}/bin/run_cellbender.py \
${raw_path} \
${name}_cellbender.h5 \
${params.cellbender.total_droplets_included} \
${expected_droplets} \
--filtered ${filtered_path}
"""
else
"""
python ${baseDir}/bin/run_cellbender.py \
${raw_path} \
${name}_cellbender.h5 \
${params.cellbender.total_droplets_included} \
${expected_droplets} \
--filtered ${filtered_path}
"""
}
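The CELLBENDER process now takes the droplet count from the samplesheet (the new fifth column, expected_droplets) instead of the single params.cellbender.total_droplets_included value. bin/run_cellbender.py itself is not part of this diff; a hypothetical sketch of a CLI that matches the invocation above, just to show where the per-sample value lands:

    import argparse

    # Hypothetical argument parser matching the process invocation
    #   run_cellbender.py <raw_h5> <output_h5> <total_droplets> --filtered <path>
    # The real bin/run_cellbender.py is not shown in this PR.
    parser = argparse.ArgumentParser(description="wrapper around cellbender remove-background")
    parser.add_argument("raw_h5", help="raw counts matrix")
    parser.add_argument("output_h5", help="path for the denoised output")
    parser.add_argument("total_droplets", type=int,
                        help="per-sample droplet count, now read from the samplesheet")
    parser.add_argument("--filtered", default=None,
                        help="optional filtered matrix used to estimate expected cells")
    args = parser.parse_args()
    print(args.raw_h5, args.output_h5, args.total_droplets, args.filtered)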
11 changes: 7 additions & 4 deletions modules/celltypist/main.nf
@@ -1,14 +1,17 @@
process CELLTYPIST {
label 'gpus'
container 'quay.io/teichlab/celltypist:latest'
containerOptions '--nv'
containerOptions "--nv --bind ${params.mount}"
publishDir "${params.outdir}/celltypist/", mode: 'copy'

input:
val name
path postprocessing_scvi_h5ad

output:
path "${params.experiment.name}_celltypist_scvi.h5ad", emit: celltypist_scvi_h5ad
val "${name}", emit: name
path "${name}_celltypist_scvi.h5ad", emit: celltypist_scvi_h5ad
path "${postprocessing_scvi_h5ad}", emit: postprocessing_h5ad

script:
def gpu_index = task.index % params.maxForks
@@ -19,7 +22,7 @@ process CELLTYPIST {
export MPLCONFIGDIR=\$PWD
python ${baseDir}/bin/run_celltypist.py \
${postprocessing_scvi_h5ad} \
${params.experiment.name}_celltypist_scvi.h5ad \
${name}_celltypist_scvi.h5ad \
--model ${params.celltypist.model} \
--majority_voting \
--use_gpu \
@@ -31,7 +34,7 @@
export MPLCONFIGDIR=\$PWD
python ${baseDir}/bin/run_celltypist.py \
${postprocessing_scvi_h5ad} \
${params.experiment.name}_celltypist_scvi.h5ad \
${name}_celltypist_scvi.h5ad \
--model ${params.celltypist.model} \
--majority_voting \
--use_gpu \
1 change: 1 addition & 0 deletions modules/doubletdetection/main.nf
@@ -11,6 +11,7 @@ process DOUBLETDETECTION {
val demultiplexing

output:
val "${name}", emit: name
path "${name}_doubletdetection.h5ad", emit: doublet_h5ad

script:
6 changes: 4 additions & 2 deletions modules/outlierfilter/main.nf
@@ -4,16 +4,18 @@ process OUTLIER_FILTER {
publishDir "${params.outdir}/outlier_filtered/", mode: 'copy'

input:
val name
path aggregation_h5ad

output:
path "${params.experiment.name ? params.experiment.name + '_' : ''}outlier_filtered.h5ad", emit: outlier_filtered_h5ad
val "${name}", emit: name
path "${name}_outlier_filtered.h5ad", emit: outlier_filtered_h5ad

script:
"""
export NUMBA_CACHE_DIR=\$PWD
python ${baseDir}/bin/outlier_filter.py \
${aggregation_h5ad} \
${params.experiment.name ? params.experiment.name + '_' : ''}outlier_filtered.h5ad
${name}_outlier_filtered.h5ad
"""
}
10 changes: 6 additions & 4 deletions modules/postprocessing/main.nf
@@ -4,23 +4,25 @@ process POSTPROCESSING {
publishDir "${params.outdir}/postprocessing/", mode: 'copy'

input:
val name
path scvi_h5ad

output:
path "${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing.h5ad", emit: postprocessing_h5ad
path "${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing_scvi.h5ad", emit: postprocessing_scvi_h5ad
val "${name}", emit: name
path "${name}_postprocessing.h5ad", emit: postprocessing_h5ad
path "${name}_postprocessing_scvi.h5ad", emit: postprocessing_scvi_h5ad

script:
"""
export NUMBA_CACHE_DIR=\$PWD
python ${baseDir}/bin/postprocessing.py \
${scvi_h5ad} \
${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing.h5ad \
${name}_postprocessing.h5ad \
--n_pca_components ${params.postprocessing.n_pca_components} \
--metadata ${params.postprocessing.metadata}
python ${baseDir}/bin/postprocessing.py \
${scvi_h5ad} \
${params.experiment.name ? params.experiment.name + '_' : ''}postprocessing_scvi.h5ad \
${name}_postprocessing_scvi.h5ad \
--use_scvi \
--metadata ${params.postprocessing.metadata}
"""
9 changes: 5 additions & 4 deletions modules/report/main.nf
@@ -4,18 +4,19 @@ process REPORT {
publishDir "${params.outdir}/report/", mode: 'copy'

input:
val name
path postprocessing_h5ad
path celltypist_scvi_h5ad

output:
path "${params.experiment.name}_report.ipynb", emit: report_ipynb
path "${params.experiment.name}_report.html", emit: report_html
path "${name}_report.ipynb", emit: report_ipynb
path "${name}_report.html", emit: report_html

script:
"""
export HOME=\$PWD
python -m ipykernel install --user --name postprocessing
papermill ${baseDir}/bin/QC.ipynb ${params.experiment.name}_report.ipynb -p plots ${params.report.plots}
jupyter nbconvert --to html ${params.experiment.name}_report.ipynb
papermill ${baseDir}/bin/QC.ipynb ${name}_report.ipynb -p plots ${params.report.plots}
jupyter nbconvert --to html ${name}_report.ipynb
"""
}
6 changes: 4 additions & 2 deletions modules/scran/main.nf
@@ -4,17 +4,19 @@ process SCRAN {
publishDir "${params.outdir}/scran/", mode: 'copy'

input:
val name
file aggregation_h5ad

output:
path "${params.experiment.name ? params.experiment.name + '_' : ''}scran.h5ad", emit: scran_h5ad
val "${name}", emit: name
path "${name}_scran.h5ad", emit: scran_h5ad

script: // set home directory to cache basilisk
"""
export PATH=/opt/conda/envs/scran/bin/:$PATH
export HOME=\$PWD
Rscript ${baseDir}/bin/scran.R \
${aggregation_h5ad} \
"${params.experiment.name ? params.experiment.name + '_' : ''}scran.h5ad"
"${name}_scran.h5ad"
"""
}
6 changes: 4 additions & 2 deletions modules/scvi/main.nf
@@ -5,17 +5,19 @@ process SCVI {
publishDir "${params.outdir}/scvi/", mode: 'copy'

input:
val name
path scran_h5ad

output:
path "${params.experiment.name ? params.experiment.name + '_' : ''}scvi.h5ad", emit: scvi_h5ad
val "${name}", emit: name
path "${name}_scvi.h5ad", emit: scvi_h5ad

script:
"""
export NUMBA_CACHE_DIR=\$PWD
python ${baseDir}/bin/scvi_norm.py \
${scran_h5ad} \
${params.experiment.name ? params.experiment.name + '_' : ''}scvi.h5ad \
${name}_scvi.h5ad \
--n_latent ${params.scvi.n_latent} \
--n_top_genes ${params.scvi.n_top_genes}
"""