Merge pull request #33 from christopher-mohr/add_gene_heatmap_process

nf-core · Jun 19, 2023 · efb0d7b · efb0d7b
2 parents dd7c75e + 70bc1b8
commit efb0d7b
Show file tree

Hide file tree

Showing 10 changed files with 175 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#33](https://github.com/nf-core/nanostring/pull/33) - Add functionality to generate gene-count heatmaps [#17](https://github.com/nf-core/nanostring/issues/17)
+
 ### `Fixed`
 
 ### `Dependencies`

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -5,7 +5,7 @@ report_comment: >
 
 report_section_order:
   BD:
-    order: 960
+    order: 970
   FOV:
     order: 950
   Posctrl_linearity:
@@ -52,6 +52,10 @@ report_section_order:
     order: 730
   No_HK_Norm_GEX_ENDO:
     order: 720
+  gene_heatmap:
+    order: 710
+  wo_HKnorm_gene_heatmap:
+    order: 700
   "nf-core-nanostring-methods-description":
     order: -1000
   software_versions:
@@ -236,8 +240,11 @@ custom_data:
     section_name: "Non-HK-Normalized count values for endogenous genes."
     description: "Non-Housekeeping-Normalized Endogenous gene count table, including available metadata."
   gene_heatmap:
-    section_name: "Heatmap for selected set of target genes"
-    description: "Normalized and log2+1 transformed Gene Expression for selected set of target genes."
+    section_name: "Count-Heatmap for all or selected set of target genes"
+    description: "Normalized and log2+1 transformed gene expression for all or selected set of target genes."
+  wo_HKnorm_gene_heatmap:
+    section_name: "Count-Heatmap for all or selected set of target genes (Non-HK-normalized)"
+    description: "Normalized and log2+1 transformed gene expression for all or selected set of target genes based on the Non-HK-normalized results."
   genescore_plage:
     section_name: "GeneScores"
     description: "Genescores computed with the PLAGE algorithm using the normalized gene expression values for all samples."

diff --git a/bin/compute_gene_heatmap.R b/bin/compute_gene_heatmap.R
@@ -0,0 +1,63 @@
+#!/usr/bin/env Rscript
+library(tidyverse)
+library(ggplot2)
+library(fs)
+library(ComplexHeatmap)
+library(circlize)
+library(yaml)
+library(ragg)
+library(tidylog)
+
+###Command line argument parsing###
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) < 1) {
+    stop("Usage: compute_gene_heatmap.R <annotated_counts.tsv> or compute_gene_heatmap.R <annotated_counts.tsv> <genes.yaml>", call.=FALSE)
+}
+input_counts <- args[1]
+
+#Read annotated counts
+# HEADER is always RCC_FILE + GENES + SAMPLE_ID and additional metadata such as GROUP TREATMENT OTHER_METADATA
+counts <- read.table(input_counts, sep="\t", check.names = FALSE, header=TRUE, stringsAsFactors = FALSE)
+
+if (length(args) == 2) {
+    input_genes <- args[2]
+    genes <- read_yaml(input_genes)
+} else {
+    gene_cols <- counts %>% dplyr::select(- any_of(c("RCC_FILE", "SAMPLE_ID", "TIME", "TREATMENT", "OTHER_METADATA")))
+    genes <- colnames(gene_cols)
+}
+
+#Select counts of interest
+counts_selected <- counts %>% dplyr::select(all_of(genes))
+
+#Add proper Rownames
+rownames(counts_selected) <- counts$SAMPLE_ID
+
+#sort dataframe by rownames to make it easier comparable across heatmaps
+counts_selected[order(row.names(counts_selected)), ]
+
+#log2+1
+counts_selected <- log2(counts_selected + 1)
+
+#Find max
+colMax <- function(data) sapply(data, max, na.rm = TRUE)
+#Find min
+colMin <- function(data) sapply(data, min, na.rm = TRUE)
+
+max_value <- max(colMax(counts_selected))
+min_value <- min(colMin(counts_selected))
+
+#Save as PDF
+
+prefix <- ""
+if (grepl("wo_HKnorm",input_counts)) {
+    prefix <- "wo_HKnorm_"
+}
+
+agg_png(file = paste0(prefix, "gene_heatmap_mqc.png"), width = 1200, height = 2000, unit = "px")
+
+Heatmap(counts_selected, name = "Gene-Count Heatmap", column_title = "Gene (log2 +1)",
+        row_title_rot = 90, row_title = "SampleID",row_dend_reorder = FALSE, show_row_dend = FALSE, row_names_side = "left",
+        show_column_dend = FALSE, col = colorRamp2(c(min_value, max_value), c("#f7f7f7", "#67a9cf")))
+
+dev.off()
diff --git a/conf/modules.config b/conf/modules.config
@@ -50,6 +50,14 @@ process {
         ]
     }
 
+    withName: CREATE_GENE_HEATMAP {
+        publishDir = [
+            path: { "${params.outdir}/gene_heatmaps" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },

diff --git a/docs/output.md b/docs/output.md
@@ -61,6 +61,19 @@ This holds the normalized and non-housekeeping-normalized annotated gene express
 
 </details>
 
+### Gene-Count Heatmaps
+
+<details markdown="1">
+<summary>Output files</summary>
+
+This holds the gene-count heatmaps generated for the normalized and non-housekeeping-normalized annotated gene expression data. These heatmaps are also part of the MultiQC report.
+
+- `gene_heatmaps/`
+- `gene_heatmap_mqc.png`: Gene-count heatmap for HK-normalized data.
+- `wo_HKnorm_gene_heatmap_mqc.png`: Gene-count heatmap for non-HK-normalized data.
+
+</details>
+
 ### MultiQC
 
 <details markdown="1">

diff --git a/docs/usage.md b/docs/usage.md
@@ -31,7 +31,9 @@ RCC_FILE,SAMPLE_ID
 
 The samplesheet can have as many columns as you desire, however, there is a strict requirement for the two columns `RCC_FILE` and `SAMPLE_ID`.
 
-A final samplesheet with additional metadata may look something like the one below. This is for 3 samples. If the column `RCC_FILE_NAME` is not specified, the pipeline will fill it automatically from the `RCC_FILE` column.
+A final samplesheet with additional metadata may look something like the one below. This is for 3 samples. If you want to use additional metadata columns please follow the instructions provided in the section on [Gene-Count Heatmap](#gene-count-heatmap).
+
+If the column `RCC_FILE_NAME` is not specified, the pipeline will fill it automatically from the `RCC_FILE` column.
 
 ```console
 RCC_FILE,RCC_FILE_NAME,SAMPLE_ID,TIME,TREATMENT,INCLUDE,OTHER_METADATA
@@ -88,6 +90,18 @@ input: 'data'
 
 You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
 
+### Gene-Count Heatmap
+
+The pipeline will generate one heatmap each, for the Housekeeping-normalized and non-Housekeeping-normalized data. These heatmaps will also be included in the MultiQC report. Per default the heatmap will include all endogenous genes. If you want to generate the heatmap for a subset of genes, please specify a `yml` file using the parameter `--heatmap_genes_to_filter` with the following format:
+
+```
+- geneA
+- geneB
+...
+```
+
+> ⚠️ If you want to use other metadata in your samplesheet than the one shown in the section [Full samplesheet](#full-samplesheet), please make sure to specify the `yml` file with all endogenous genes or a subset of it.
+
 ### Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

diff --git a/modules/local/create_gene_heatmap.nf b/modules/local/create_gene_heatmap.nf
@@ -0,0 +1,35 @@
+process CREATE_GENE_HEATMAP {
+    label 'process_single'
+
+    conda "r-nacho=2.0.4 r-tidyverse=2.0.0 r-ggplot2=3.4.2 r-rlang=1.1.1 r-tidylog=1.0.2 r-fs=1.6.2 bioconductor-complexheatmap=2.14.0 r-circlize=0.4.15 r-yaml=2.3.7 r-ragg=1.2.5 r-rcolorbrewer=1.1_3 r-pheatmap=1.0.12"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-68b3ca19fcb1f8b052324cb635ab60f8b17a3058:e4b1ecb1e69304213c695190148317b26caa2841-0' :
+        'biocontainers/mulled-v2-68b3ca19fcb1f8b052324cb635ab60f8b17a3058:e4b1ecb1e69304213c695190148317b26caa2841-0' }"
+
+    input:
+    path annotated_counts
+
+    output:
+    path "*gene_heatmap_mqc.png", emit: gene_heatmap
+    path "versions.yml"         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    def gene_filter = params.heatmap_genes_to_filter ?: ""
+
+    """
+    compute_gene_heatmap.R $annotated_counts $gene_filter
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(echo \$(R --version 2>&1) | sed 's/^.*R version //; s/ .*\$//')
+        r-nacho: \$(Rscript -e "library(NACHO); cat(as.character(packageVersion('NACHO')))")
+        r-tidyverse: \$(Rscript -e "library(tidyverse); cat(as.character(packageVersion('tidyverse')))")
+        r-fs: \$(Rscript -e "library(fs); cat(as.character(packageVersion('fs')))")
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -12,6 +12,9 @@ params {
     // Input options
     input                      = null
 
+    // Pipeline options
+    heatmap_genes_to_filter    = null
+
     // References
     genome                     = null
     igenomes_base              = 's3://ngi-igenomes/igenomes'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -42,6 +42,20 @@
                 }
             }
         },
+        "processing_options": {
+            "title": "Processing options",
+            "type": "object",
+            "description": "Define how the pipeline should process the data.",
+            "default": "",
+            "fa_icon": "fas fa-exchange-alt",
+            "properties": {
+                "heatmap_genes_to_filter": {
+                    "type": "string",
+                    "description": "Path to yml file (list, one item per line) to specify which genes should be used for the gene-count heatmap.",
+                    "default": null
+                }
+            }
+        },
         "reference_genome_options": {
             "title": "Reference genome options",
             "type": "object",
@@ -256,6 +270,9 @@
         {
             "$ref": "#/definitions/input_output_options"
         },
+        {
+            "$ref": "#/definitions/processing_options"
+        },
         {
             "$ref": "#/definitions/reference_genome_options"
         },

diff --git a/workflows/nanostring.nf b/workflows/nanostring.nf
@@ -44,6 +44,7 @@ include { NORMALIZE }       from '../subworkflows/local/normalize'
 // MODULES
 //
 include { CREATE_ANNOTATED_TABLES } from '../modules/local/create_annotated_tables'
+include { CREATE_GENE_HEATMAP     } from '../modules/local/create_gene_heatmap'
 
 
 /*
@@ -114,6 +115,13 @@ workflow NANOSTRING {
     )
     ch_versions = ch_versions.mix(CREATE_ANNOTATED_TABLES.out.versions)
 
+    //
+    // MODULE: Compute gene-count heatmap for MultiQC report based on annotated (ENDO) counts
+    //
+    CREATE_GENE_HEATMAP (
+        CREATE_ANNOTATED_TABLES.out.annotated_endo_data
+    )
+    ch_versions = ch_versions.mix(CREATE_GENE_HEATMAP.out.versions)
 
     //
     // DUMP SOFTWARE VERSIONS
@@ -137,6 +145,7 @@ workflow NANOSTRING {
     ch_multiqc_files = ch_multiqc_files.mix(QUALITY_CONTROL.out.nacho_qc_multiqc_metrics.collect())
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
     ch_multiqc_files = ch_multiqc_files.mix(CREATE_ANNOTATED_TABLES.out.annotated_data_mqc.collect())
+    ch_multiqc_files = ch_multiqc_files.mix(CREATE_GENE_HEATMAP.out.gene_heatmap.collect())
 
     MULTIQC (
         ch_multiqc_files.collect(),