Add enrichment

koesterlab · Oct 9, 2024 · b80edbd · b80edbd
1 parent 934504f
commit b80edbd
Show file tree

Hide file tree

Showing 9 changed files with 210 additions and 56 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -27,27 +27,46 @@ enrichment:
     activate: true
     fdr_genes: 0.05
     fdr_go_terms: 0.05
+    pathvars:
+      input_file: "results/dmr_calls/{group}/genes_transcripts/chipseeker_postprocessed"
+      output_file: "results/enrichment/{group}/genes_transcripts/go_term_enrichment.gene_fdr_{gene_fdr}.go_term_fdr_{go_term_fdr}"
+      # Must contain ns
+      plot_file: "results/enrichment/{{group}}/plots/go_term_enrichment_{ns}.gene_fdr_{{gene_fdr}}.go_term_fdr_{{go_term_fdr}}"
+      datavzrd_file: "results/enrichment/{group}/datavzrd/go_enrichment-{gene_fdr}.go_term_fdr_{go_term_fdr}"
+      # Must contain all wildcards from the other files
+      log_file_name: "{group}_{gene_fdr}_{go_term_fdr}"      
+      wildcards:
+        # group: 'read_sample_tsv(config["sample_path"])'
+        group: '["BC02-ref-sorted", "BC03-ref-sorted", "BC04-ref-sorted",]'
+        gene_fdr: "str(config['enrichment']['goatools']['fdr_genes']).replace('.', '-')"
+        go_term_fdr: "str(config['enrichment']['goatools']['fdr_go_terms']).replace('.', '-')"
+        ns: '["BP", "CC", "MF"]'
   fgsea:
     gene_sets_file: "resources/gene_sets/dummy.gmt"
     # tool is only run if set to `true`
-    activate: true
+    activate: false
     # if activated, you need to provide a GMT file with gene sets of interest
     fdr_gene_set: 0.05
     eps: 1.0e-50
-  spia:
-    # tool is only run if set to `true`
-    activate: true
-    # pathway database to use in SPIA, needs to be available for
-    # the species specified by resources -> ref -> species above
-    pathway_database: "reactome"
-    # OrgDB Genome wide annotation package (https://www.bioconductor.org/packages/release/BiocViews.html#___OrgDb) for the species under consideration. 
-    # Only required if you want to have a gene analysis for your pathways. Else NA
-    orgDb: org.Hs.eg.db
+    pathvars:
+      sleuth_sample: "results/sleuth/{group}.samples.tsv"
+      input_file: "results/dmr_calls/{group}/genes_transcripts/chipseeker_postprocessed"
+      output_file: "results/tables/fgsea/{group}"
+      plot_file: "results/plots/fgsea/{group}"
+      log_file_name: "{group}"
+      wildcards:
+        # group: 'read_sample_tsv(config["sample_path"])'
+        group: '["BC02-ref-sorted", "BC03-ref-sorted", "BC04-ref-sorted",]'
+
+  effect_col:
+    dynamic: false
+    dynamic_expression: ""
+    static_value: "mean_methylation_difference"
 
 # Not used right now
 meta_comparisons:
   # comparison is only run if set to `true`
-  activate: true
+  activate: false
   # Define here the comparisons under interest
   comparisons:
     # Define any name for comparison. You can add as many comparisions as you want

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -21,20 +21,21 @@ include: "rules/common.smk"
 include: "rules/dmr_heatmap.smk"
 include: "rules/dmr_calling.smk"
 include: "rules/prepare_meth_calling.smk"
+include: "rules/prepare_enrichment.smk"
 include: "rules/methylation_calling.smk"
 include: "rules/annotate_dmrs.smk"
 
 
 # declare https://github.com/Addimator/enrichment as a module
 module enrichment:
     snakefile:
-        github(
-            "Addimator/enrichment",
-            path="workflow/Snakefile",
-            # tag="v2.5.4",
-            branch="main",
-        )
-        # "/home/adrian/Documents/Promotion/locotact_stuff/enrichment/workflow/Snakefile"
+        # github(
+        #     "Addimator/enrichment",
+        #     path="workflow/Snakefile",
+        #     # tag="v2.5.4",
+        #     branch="main",
+        # )
+        "/homes/aprinz/enrichment/workflow/Snakefile"
     config:
         config
 
@@ -46,5 +47,5 @@ use rule * from enrichment
 rule modified_all:
     default_target: True
     input:
-        # rules.all.input,
         all_input,
+        rules.all.input,
diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconductor-biomart =2.58
+  - r-tidyverse =2.0
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
@@ -1,36 +1,3 @@
-# name: varlociraptor
-# channels:
-#   - conda-forge
-#   - bioconda
-#   - rmg
-#   - nodefaults
-# dependencies:
-#   - alignoth=0.7.3
-#   # - bcftools=1.16
-#   # - bwa=0.7.17
-#   - bzip2=1.0.8
-#   - ca-certificates=2022.12.7
-#   - cmake=3.25.1
-#   - curl=7.87.0
-#   - expat=2.5.0
-#   - gcc=12.2.0
-#   - gsl=2.7
-#   # - htslib=1.16
-#   - make=4.3
-#   - mason=2.0.9
-#   - perl=5.32.1
-#   - pkg-config=0.29.2
-#   - rust=1.80.0
-#   # - samtools=1.16.1
-#   - xz=5.2.6
-#   # - zlib=1.2.13
-#   - zlib-ng=2.0.6
-#   - zstd=1.5.2
-#   - libclang=16.0.6
-#   - clangdev=16.0.6
-#   - libgcc-ng
-
-
 channels:
   - conda-forge
   - bioconda

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -15,6 +15,12 @@ def read_sample_tsv(sample_tsv_path):
 samples = read_sample_tsv(sample_tsv_path)
 
 
+def get_bioc_species_name():
+    first_letter = config["resources"]["ref"]["species"][0]
+    subspecies = config["resources"]["ref"]["species"].split("_")[1]
+    return first_letter + subspecies
+
+
 def all_input(wildcards):
     wanted_input = []
     wanted_input.extend(

diff --git a/workflow/rules/methylation_calling.smk b/workflow/rules/methylation_calling.smk
@@ -5,7 +5,7 @@ scattergather:
 
 rule find_candidates:
     input:
-        "resources/genome.fasta",
+        fasta="resources/genome.fasta",
         varlo_path="resources/tools/varlociraptor",
     output:
         "resources/candidates/candidates.bcf",
@@ -15,8 +15,9 @@ rule find_candidates:
         "../envs/varlociraptor.yaml"
     shell:
         """ 
+        PIPELINE_PATH=$(pwd)
         cd {input.varlo_path}
-        cargo run -- methylation-candidates {input} {output}
+        cargo run -- methylation-candidates $PIPELINE_PATH/{input.fasta} $PIPELINE_PATH/{output}
         """
 
 
@@ -51,8 +52,9 @@ rule compute_meth_observations:
         sequencer=lambda wildcards: samples[wildcards.sample][1],
     shell:
         """ 
+        PIPELINE_PATH=$(pwd)
         cd {input.varlo_path}
-        cargo run --release -- preprocess variants {input.genome} --candidates {input.candidates} --bam {input.alignments} --read-type {params.sequencer} --max-depth 1000 > {output}
+        cargo run --release -- preprocess variants $PIPELINE_PATH/{input.genome} --candidates $PIPELINE_PATH/{input.candidates} --bam $PIPELINE_PATH/{input.alignments} --read-type {params.sequencer} --max-depth 1000 > $PIPELINE_PATH/{output}
         """
 
 
@@ -69,8 +71,9 @@ rule call_methylation:
         "../envs/varlociraptor.yaml"
     shell:
         """ 
+        PIPELINE_PATH=$(pwd)
         cd {input.varlo_path}
-        cargo run --release -- call variants --omit-strand-bias generic --scenario {input.scenario} --obs normal={input.preprocess_obs} > {output}
+        cargo run --release -- call variants --omit-strand-bias generic --scenario $PIPELINE_PATH/{input.scenario} --obs normal=$PIPELINE_PATH/{input.preprocess_obs} > $PIPELINE_PATH/{output}
         """
 
 

diff --git a/workflow/rules/prepare_enrichment.smk b/workflow/rules/prepare_enrichment.smk
@@ -0,0 +1,47 @@
+rule get_ensembl_genes:
+    input:
+        "results/dmr_calls/{group2}/genes_transcripts/chipseeker.tsv",
+    output:
+        "results/dmr_calls/{group2}/genes_transcripts/ensembl_genes.tsv",
+    conda:
+        "../envs/biomart.yaml"
+    log:
+        "logs/get_ensembl_genes_{group2}.log",
+    params:
+        species=get_bioc_species_name(),
+        version=config["resources"]["ref"]["release"],
+    script:
+        "../scripts/get_ensembl_genes.R"
+
+
+# We only merge with polars because the join in get_ensembl_genes.R does not work
+rule annotate_chipseeker:
+    input:
+        chipseeker="results/dmr_calls/{group2}/genes_transcripts/chipseeker.tsv",
+        genes="results/dmr_calls/{group2}/genes_transcripts/ensembl_genes.tsv",
+    output:
+        "results/dmr_calls/{group2}/genes_transcripts/chipseeker_postprocessed.tsv",
+    conda:
+        "../envs/python_standard.yaml"
+    log:
+        "logs/annotate_chipseeker{group2}.log",
+    script:
+        "../scripts/annotate_chipseeker.py"
+
+
+# rule compose_sample_sheet:
+#     input:
+#         config["samples"],
+#         config["units"],
+#         kallisto_output=kallisto_output,
+#     output:
+#         "results/sleuth/{group}.samples.tsv",
+#     log:
+#         "logs/{group}.compose-sample-sheet.log",
+#     params:
+#         units=units,
+#         samples=samples,
+#     group:
+#         "sleuth-init"
+#     script:
+#         "../scripts/compose-sample-sheet.py"
diff --git a/workflow/scripts/annotate_chipseeker.py b/workflow/scripts/annotate_chipseeker.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import sys
+
+# Define file paths from Snakemake
+chipseeker_file = snakemake.input["chipseeker"]
+ensembl_genes_file = snakemake.input["genes"]
+output_file = snakemake.output[0]
+
+# Load the input files
+chipseeker_df = pd.read_csv(chipseeker_file, sep="\t")
+ensembl_genes_df = pd.read_csv(ensembl_genes_file, sep="\t")
+
+# Rename columns to match for merging
+ensembl_genes_df.rename(columns={"ensembl_transcript_id": "transcriptId"}, inplace=True)
+
+# Perform the inner join
+merged_df = pd.merge(chipseeker_df, ensembl_genes_df, on="transcriptId", how="inner")
+
+# Drop rows with missing values in the specified columns
+filtered_df = merged_df.dropna(
+    subset=["transcriptId", "ensembl_gene_id", "external_gene_name"]
+)
+
+filtered_df = filtered_df.rename(
+    columns={
+        "q_value": "qval",
+        "ensembl_gene_id": "ens_gene",
+        "external_gene_name": "ext_gene",
+    }
+)
+
+# Save the processed data to the output file
+filtered_df.to_csv(output_file, sep="\t", index=False)
diff --git a/workflow/scripts/get_ensembl_genes.R b/workflow/scripts/get_ensembl_genes.R
@@ -0,0 +1,72 @@
+#  log <- file(snakemake@log[[1]], open="wt")
+#  sink(log)
+#  sink(log, type="message")
+
+library(biomaRt)
+library("tidyverse")
+library("cli")
+
+data <- read.table(snakemake@input[[1]], sep="\t", header=TRUE)
+
+mart <- "useast"
+rounds <- 0
+while (class(mart)[[1]] != "Mart") {
+  mart <- tryCatch(
+    {
+      # done here, because error function does not
+      # modify outer scope variables, I tried
+      if (mart == "www") rounds <- rounds + 1
+      # equivalent to useMart, but you can choose
+      # the mirror instead of specifying a host
+      biomaRt::useEnsembl(
+        biomart = "ENSEMBL_MART_ENSEMBL",
+        dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
+        version = snakemake@params[["version"]],
+        mirror = mart
+      )
+    },
+    error = function(e) {
+      # change or make configurable if you want more or
+      # less rounds of tries of all the mirrors
+      if (rounds >= 3) {
+        cli_abort(
+          str_c(
+            "Have tried all 4 available Ensembl biomaRt mirrors ",
+            rounds,
+            " times. You might have a connection problem, or no mirror is responsive.\n",
+            "The last error message was:\n",
+            message(e)
+          )
+        )
+      }
+      # hop to next mirror
+      mart <- switch(mart,
+        useast = "uswest",
+        uswest = "asia",
+        asia = "www",
+        www = {
+          # wait before starting another round through the mirrors,
+          # hoping that intermittent problems disappear
+          Sys.sleep(30)
+          "useast"
+        }
+      )
+    }
+  )
+}
+
+gene_info <- getBM(attributes=c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name'),
+                   filters='ensembl_transcript_id', 
+                   values=data$transcriptId, 
+                   mart=mart)
+
+
+# Join does not work as intended
+# data_with_genes <- 
+#                     inner_join(
+#                     data,
+#                     gene_info,
+#                     by = join_by(transcriptId == ensembl_transcript_id))
+
+
+write.table(gene_info, file=snakemake@output[[1]], sep="\t", row.names=FALSE, quote=FALSE)