feat: add gseapy gene set enrichment (dryrun working, not tested othe…

…rwise)
dlaehnemann · Feb 6, 2024 · 9fadae7 · 9fadae7
1 parent 93958f3
commit 9fadae7
Show file tree

Hide file tree

Showing 9 changed files with 220 additions and 4 deletions.
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -34,4 +34,12 @@ enrichment:
     activate_gfold: True
     # pathway database to use in SPIA, needs to be available for
     # the species specified by resources -> ref -> species above
-    pathway_database: "panther"
+    pathway_database: "panther"
+  gseapy:
+    # tool is only run if set to `True`
+    activate_gfold: True
+  # list of enrichr libraries/genesets to use for gseapy, for options see:
+  # https://maayanlab.cloud/Enrichr/#libraries
+  enrichr_libraries:
+    - MSigDB_Hallmark_2020
+    - MSigDB_Computational
diff --git a/config/config.yaml b/config/config.yaml
@@ -38,4 +38,17 @@ enrichment:
     activate_gfold: True
     # pathway database to use in SPIA, needs to be available for
     # the species specified by resources -> ref -> species above
-    pathway_database: "reactome"
+    pathway_database: "reactome"
+  gseapy:
+    # tool is only run if set to `True`
+    activate_gfold: True
+  # list of enrichr libraries/genesets to use for gseapy, for options see:
+  # https://maayanlab.cloud/Enrichr/#libraries
+  enrichr_libraries:
+    - KEGG_2021_Human
+    - GO_Molecular_Function_2021
+    - GO_Cellular_Component_2021
+    - GO_Biological_Process_2021
+    - MSigDB_Oncogenic_Signatures
+    - MSigDB_Hallmark_2020
+    - MSigDB_Computational
diff --git a/workflow/envs/gseapy.yaml b/workflow/envs/gseapy.yaml
@@ -0,0 +1,9 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - gseapy =1.0.3
+  - python =3.7
+  - ipykernel
+  - jupyter =1.0
diff --git a/workflow/report/gseapy.rst b/workflow/report/gseapy.rst
@@ -0,0 +1 @@
+Enrichment of {{snakemake.wildcards.enrichr_library}} for {{snakemake.params.model["formula"]}} for the following parameters: {{snakemake.wildcards.paramspace_instance}}. {% if snakemake.wildcards.diffexp_filter != "nofilter" %} Here, genes have been additionally filtered by the following statement: {{ snakemake.params.diffexp_query }} {% endif %} Enrichment was performed with GSEApy, using a hypergeometric test for the null hypothesis that the genes in the gene set are independent of the significantly differentially expressed genes. The background was adjusted for the set of measured genes in the panel.
diff --git a/workflow/resources/datavzrd/gseapy_template.yaml b/workflow/resources/datavzrd/gseapy_template.yaml
@@ -0,0 +1,72 @@
+name: ?f"Enrichment of {wildcards.enrichr_library} for {wildcards.contrast}"
+
+datasets:
+  enrichment:
+    path: ?input.enrichment
+    separator: "\t"
+
+views:
+  enrichment:
+    dataset: enrichment
+    page-size: 18
+    desc: |
+      Enrichment analysis of all gene sets in {wildcards.enrichr_library}.
+
+      * term: gene set name
+      * overlap: number of genes that have a significantly non-zero fold change (according to GFOLD) vs. all measured genes in the gene set
+      * p-value: p-value of hypergeometric test for the null hypothesis that the gene set is not enriched
+      * FDR: false discovery rate calculated with Benjamini-Hochberg method
+      * genes: genes that have a significantly non-zero fold change (according to GFOLD)
+
+    render-table:
+      columns:
+        term:
+          link-to-url:
+            linkout:
+              url: "{linkout}"
+        linkout:
+          display-mode: hidden
+        p-value:
+          plot:
+            heatmap:
+              scale: linear
+              domain:
+                - 0.0
+                - 0.049
+                - 0.05
+                - 0.051
+                - 1.0
+              range:
+                - "#a1d99b"
+                - "#ecf7eb"
+                - white
+                - "#ffeedf"
+                - "#fdae6b"
+        FDR:
+          plot:
+            heatmap:
+              scale: linear
+              domain:
+                - 0.0
+                - 0.049
+                - 0.05
+                - 0.051
+                - 1.0
+              range:
+                - "#a1d99b"
+                - "#ecf7eb"
+                - white
+                - "#ffeedf"
+                - "#fdae6b"
+        odds ratio:
+          plot:
+            heatmap:
+              scale: linear
+              color-scheme: blues
+        genes:
+          custom: |
+            function(value) {
+              return value.split(";").map(function(gene) {
+                return `<a href="https://www.genecards.org/cgi-bin/carddisp.pl?gene=${gene}"><span class="badge badge-info">${gene}</span></a>`
+              })
+            }
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -20,6 +20,15 @@ def final_output(wildcards):
                     f"results/datavzrd-reports/spia/{contrast['changed']}_vs_{contrast['baseline']}",
                 ]
             )
+        if config["enrichment"]["gseapy"]["activate_gfold"]:
+            final_output.extend(
+                expand(
+                    "results/datavzrd/gseapy/{contrast}/{enrichr_library}",
+                    contrast=f"{contrast['changed']}_vs_{contrast['baseline']}",
+                    enrichr_library=config["enrichment"]["enrichr_libraries"],
+                )
+            )
+
     return final_output
 
 

diff --git a/workflow/rules/enrichment.smk b/workflow/rules/enrichment.smk
@@ -43,13 +43,62 @@ rule spia_datavzrd:
             ),
             htmlindex="index.html",
             caption="../report/spia_table.rst",
-            category="Pathway enrichment",
+            category="Enrichment analysis",
             patterns=["index.html"],
             labels={
-                "contrast": "{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}"
+                "contrast": "{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}",
+                "enrichment": "pathway",
             },
         ),
     log:
         "logs/datavzrd-reports/spia/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.log",
     wrapper:
         "v2.12.0/utils/datavzrd"
+
+
+rule gseapy:
+    input:
+        filtered="results/gfold/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.cleaned_and_sorted.tsv",
+        unfiltered="results/gfold/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.all_tested_annotated.tsv",
+    output:
+        "results/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.tsv",
+    log:
+        "logs/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.log",
+    params:
+        species=config["resources"]["ref"]["species"],
+    conda:
+        "../envs/gseapy.yaml"
+    script:
+        "../scripts/gsea.py"
+
+
+rule render_gseapy_datavzrd_config:
+    input:
+        template=workflow.source_path("../resources/datavzrd/gseapy_template.yaml"),
+        enrichment="results/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.tsv",
+    output:
+        "resources/datavzrd/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.yaml",
+    template_engine:
+        "yte"
+
+
+use rule spia_datavzrd as gseapy_datavzrd with:
+    input:
+        config="resources/datavzrd/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.yaml",
+        table="results/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.tsv",
+    output:
+        report(
+            directory(
+                "results/datavzrd/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}"
+            ),
+            htmlindex="index.html",
+            category="Enrichment analysis",
+            subcategory="{enrichr_library}",
+            caption="../report/gseapy.rst",
+            labels={
+                "contrast": "{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}",
+                "enrichment": "gene set",
+            },
+        ),
+    log:
+        "logs/datavzrd-reports/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.{enrichr_library}.log",
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -39,8 +39,17 @@ properties:
             type: string
       required:
         - activate_gfold
+    properties:
+      gseapy:
+        type: object
+        properties:
+          activate_gfold:
+            type: string
+      required:
+        - activate_gfold
   required:
     - spia
+    - gseapy
 required:
   - gfold
   - enrichment
diff --git a/workflow/scripts/gsea.py b/workflow/scripts/gsea.py
@@ -0,0 +1,46 @@
+import sys
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+import gseapy as gp
+
+background = pd.read_csv(snakemake.input.unfiltered, sep="\t").loc[:, "gene_symbol"].drop_duplicates()
+
+gfold_change = pd.read_csv(snakemake.input.filtered, sep="\t").loc[:, "gene_symbol"].drop_duplicates()
+
+if snakemake.wildcards.enrichr_library == "MSigDB_Hallmark_2020":
+    def linkout_func(term):
+        return f"https://www.gsea-msigdb.org/gsea/msigdb/cards/HALLMARK_{term.upper().replace(' ', '_')}"
+elif snakemake.wildcards.enrichr_library in ("GO_Biological_Process_2020", "GO_Molecular_Function_2021", "GO_Cellular_Component_2021"):
+    def linkout_func(term):
+        goid = term.split(" ")[-1].strip("()")
+        return f"https://www.ebi.ac.uk/QuickGO/term/{goid}"
+else:
+    def linkout_func(term):
+        return f"https://www.google.com/search?q={term}"
+
+
+if len(gfold_change) <= 1:
+    # enrichment does only work with at least 2 genes
+    pd.DataFrame(columns=["term", "linkout", "p-value", "FDR", "odds ratio", "genes"]).to_csv(snakemake.output[0], sep='\t', index=False)
+else:
+    gene_set = gp.parser.download_library(snakemake.wildcards.enrichr_library, snakemake.params.species)
+
+    enrichment = gp.enrich(
+        gene_list=gfold_change,
+        gene_sets=[gene_set],
+        background=background,
+    )
+
+    results = enrichment.results.sort_values("P-value")
+    results.drop(columns="Gene_set", inplace=True)
+    results.columns = results.columns.str.lower()
+    results.rename(columns={"adjusted p-value": "FDR"}, inplace=True)
+
+    # simplify floats
+    for col in ["p-value", "FDR", "odds ratio"]:
+        results[col] = results[col].map("{:.3g}".format)
+
+    results.insert(1, "linkout", results["term"].map(linkout_func))
+
+    results.to_csv(snakemake.output[0], sep='\t', index=False)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Enrichment of {{snakemake.wildcards.enrichr_library}} for {{snakemake.params.model["formula"]}} for the following parameters: {{snakemake.wildcards.paramspace_instance}}. {% if snakemake.wildcards.diffexp_filter != "nofilter" %} Here, genes have been additionally filtered by the following statement: {{ snakemake.params.diffexp_query }} {% endif %} Enrichment was performed with GSEApy, using a hypergeometric test for the null hypothesis that the genes in the gene set are independent of the significantly differentially expressed genes. The background was adjusted for the set of measured genes in the panel.