Skip to content

Commit

Permalink
feat: add gseapy gene set enrichment (dryrun working, not tested othe…
Browse files Browse the repository at this point in the history
…rwise)
  • Loading branch information
dlaehnemann committed Feb 6, 2024
1 parent 93958f3 commit 9fadae7
Show file tree
Hide file tree
Showing 9 changed files with 220 additions and 4 deletions.
10 changes: 9 additions & 1 deletion .test/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,12 @@ enrichment:
activate_gfold: True
# pathway database to use in SPIA, needs to be available for
# the species specified by resources -> ref -> species above
pathway_database: "panther"
pathway_database: "panther"
gseapy:
# tool is only run if set to `True`
activate_gfold: True
# list of enrichr libraries/genesets to use for gseapy, for options see:
# https://maayanlab.cloud/Enrichr/#libraries
enrichr_libraries:
- MSigDB_Hallmark_2020
- MSigDB_Computational
15 changes: 14 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,17 @@ enrichment:
activate_gfold: True
# pathway database to use in SPIA, needs to be available for
# the species specified by resources -> ref -> species above
pathway_database: "reactome"
pathway_database: "reactome"
gseapy:
# tool is only run if set to `True`
activate_gfold: True
# list of enrichr libraries/genesets to use for gseapy, for options see:
# https://maayanlab.cloud/Enrichr/#libraries
enrichr_libraries:
- KEGG_2021_Human
- GO_Molecular_Function_2021
- GO_Cellular_Component_2021
- GO_Biological_Process_2021
- MSigDB_Oncogenic_Signatures
- MSigDB_Hallmark_2020
- MSigDB_Computational
9 changes: 9 additions & 0 deletions workflow/envs/gseapy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- gseapy =1.0.3
- python =3.7
- ipykernel
- jupyter =1.0
1 change: 1 addition & 0 deletions workflow/report/gseapy.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enrichment of {{snakemake.wildcards.enrichr_library}} for {{snakemake.params.model["formula"]}} for the following parameters: {{snakemake.wildcards.paramspace_instance}}. {% if snakemake.wildcards.diffexp_filter != "nofilter" %} Here, genes have been additionally filtered by the following statement: {{ snakemake.params.diffexp_query }} {% endif %} Enrichment was performed with GSEApy, using a hypergeometric test for the null hypothesis that the genes in the gene set are independent of the significantly differentially expressed genes. The background was adjusted for the set of measured genes in the panel.
72 changes: 72 additions & 0 deletions workflow/resources/datavzrd/gseapy_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
name: ?f"Enrichment of {wildcards.enrichr_library} for {wildcards.contrast}"

datasets:
enrichment:
path: ?input.enrichment
separator: "\t"

views:
enrichment:
dataset: enrichment
page-size: 18
desc: |
Enrichment analysis of all gene sets in {wildcards.enrichr_library}.
* term: gene set name
* overlap: number of genes that have a significantly non-zero fold change (according to GFOLD) vs. all measured genes in the gene set
* p-value: p-value of hypergeometric test for the null hypothesis that the gene set is not enriched
* FDR: false discovery rate calculated with Benjamini-Hochberg method
* genes: genes that have a significantly non-zero fold change (according to GFOLD)
render-table:
columns:
term:
link-to-url:
linkout:
url: "{linkout}"
linkout:
display-mode: hidden
p-value:
plot:
heatmap:
scale: linear
domain:
- 0.0
- 0.049
- 0.05
- 0.051
- 1.0
range:
- "#a1d99b"
- "#ecf7eb"
- white
- "#ffeedf"
- "#fdae6b"
FDR:
plot:
heatmap:
scale: linear
domain:
- 0.0
- 0.049
- 0.05
- 0.051
- 1.0
range:
- "#a1d99b"
- "#ecf7eb"
- white
- "#ffeedf"
- "#fdae6b"
odds ratio:
plot:
heatmap:
scale: linear
color-scheme: blues
genes:
custom: |
function(value) {
return value.split(";").map(function(gene) {
return `<a href="https://www.genecards.org/cgi-bin/carddisp.pl?gene=${gene}"><span class="badge badge-info">${gene}</span></a>`
})
}
9 changes: 9 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ def final_output(wildcards):
f"results/datavzrd-reports/spia/{contrast['changed']}_vs_{contrast['baseline']}",
]
)
if config["enrichment"]["gseapy"]["activate_gfold"]:
final_output.extend(
expand(
"results/datavzrd/gseapy/{contrast}/{enrichr_library}",
contrast=f"{contrast['changed']}_vs_{contrast['baseline']}",
enrichr_library=config["enrichment"]["enrichr_libraries"],
)
)

return final_output


Expand Down
53 changes: 51 additions & 2 deletions workflow/rules/enrichment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,62 @@ rule spia_datavzrd:
),
htmlindex="index.html",
caption="../report/spia_table.rst",
category="Pathway enrichment",
category="Enrichment analysis",
patterns=["index.html"],
labels={
"contrast": "{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}"
"contrast": "{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}",
"enrichment": "pathway",
},
),
log:
"logs/datavzrd-reports/spia/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.log",
wrapper:
"v2.12.0/utils/datavzrd"


rule gseapy:
input:
filtered="results/gfold/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.cleaned_and_sorted.tsv",
unfiltered="results/gfold/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.all_tested_annotated.tsv",
output:
"results/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.tsv",
log:
"logs/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.log",
params:
species=config["resources"]["ref"]["species"],
conda:
"../envs/gseapy.yaml"
script:
"../scripts/gsea.py"


rule render_gseapy_datavzrd_config:
input:
template=workflow.source_path("../resources/datavzrd/gseapy_template.yaml"),
enrichment="results/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.tsv",
output:
"resources/datavzrd/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.yaml",
template_engine:
"yte"


use rule spia_datavzrd as gseapy_datavzrd with:
input:
config="resources/datavzrd/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.yaml",
table="results/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}.tsv",
output:
report(
directory(
"results/datavzrd/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}/{enrichr_library}"
),
htmlindex="index.html",
category="Enrichment analysis",
subcategory="{enrichr_library}",
caption="../report/gseapy.rst",
labels={
"contrast": "{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}",
"enrichment": "gene set",
},
),
log:
"logs/datavzrd-reports/gseapy/{sample_changed}-{unit_changed}_vs_{sample_baseline}-{unit_baseline}.{enrichr_library}.log",
9 changes: 9 additions & 0 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,17 @@ properties:
type: string
required:
- activate_gfold
properties:
gseapy:
type: object
properties:
activate_gfold:
type: string
required:
- activate_gfold
required:
- spia
- gseapy
required:
- gfold
- enrichment
46 changes: 46 additions & 0 deletions workflow/scripts/gsea.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import sys
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd
import gseapy as gp

background = pd.read_csv(snakemake.input.unfiltered, sep="\t").loc[:, "gene_symbol"].drop_duplicates()

gfold_change = pd.read_csv(snakemake.input.filtered, sep="\t").loc[:, "gene_symbol"].drop_duplicates()

if snakemake.wildcards.enrichr_library == "MSigDB_Hallmark_2020":
def linkout_func(term):
return f"https://www.gsea-msigdb.org/gsea/msigdb/cards/HALLMARK_{term.upper().replace(' ', '_')}"
elif snakemake.wildcards.enrichr_library in ("GO_Biological_Process_2020", "GO_Molecular_Function_2021", "GO_Cellular_Component_2021"):
def linkout_func(term):
goid = term.split(" ")[-1].strip("()")
return f"https://www.ebi.ac.uk/QuickGO/term/{goid}"
else:
def linkout_func(term):
return f"https://www.google.com/search?q={term}"


if len(gfold_change) <= 1:
# enrichment does only work with at least 2 genes
pd.DataFrame(columns=["term", "linkout", "p-value", "FDR", "odds ratio", "genes"]).to_csv(snakemake.output[0], sep='\t', index=False)
else:
gene_set = gp.parser.download_library(snakemake.wildcards.enrichr_library, snakemake.params.species)

enrichment = gp.enrich(
gene_list=gfold_change,
gene_sets=[gene_set],
background=background,
)

results = enrichment.results.sort_values("P-value")
results.drop(columns="Gene_set", inplace=True)
results.columns = results.columns.str.lower()
results.rename(columns={"adjusted p-value": "FDR"}, inplace=True)

# simplify floats
for col in ["p-value", "FDR", "odds ratio"]:
results[col] = results[col].map("{:.3g}".format)

results.insert(1, "linkout", results["term"].map(linkout_func))

results.to_csv(snakemake.output[0], sep='\t', index=False)

0 comments on commit 9fadae7

Please sign in to comment.