Skip to content

Commit

Permalink
Add enrichment
Browse files Browse the repository at this point in the history
  • Loading branch information
Addimator committed Oct 9, 2024
1 parent 934504f commit b80edbd
Show file tree
Hide file tree
Showing 9 changed files with 210 additions and 56 deletions.
41 changes: 30 additions & 11 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,46 @@ enrichment:
activate: true
fdr_genes: 0.05
fdr_go_terms: 0.05
pathvars:
input_file: "results/dmr_calls/{group}/genes_transcripts/chipseeker_postprocessed"
output_file: "results/enrichment/{group}/genes_transcripts/go_term_enrichment.gene_fdr_{gene_fdr}.go_term_fdr_{go_term_fdr}"
# Must contain ns
plot_file: "results/enrichment/{{group}}/plots/go_term_enrichment_{ns}.gene_fdr_{{gene_fdr}}.go_term_fdr_{{go_term_fdr}}"
datavzrd_file: "results/enrichment/{group}/datavzrd/go_enrichment-{gene_fdr}.go_term_fdr_{go_term_fdr}"
# Must contain all wildcards from the other files
log_file_name: "{group}_{gene_fdr}_{go_term_fdr}"
wildcards:
# group: 'read_sample_tsv(config["sample_path"])'
group: '["BC02-ref-sorted", "BC03-ref-sorted", "BC04-ref-sorted",]'
gene_fdr: "str(config['enrichment']['goatools']['fdr_genes']).replace('.', '-')"
go_term_fdr: "str(config['enrichment']['goatools']['fdr_go_terms']).replace('.', '-')"
ns: '["BP", "CC", "MF"]'
fgsea:
gene_sets_file: "resources/gene_sets/dummy.gmt"
# tool is only run if set to `true`
activate: true
activate: false
# if activated, you need to provide a GMT file with gene sets of interest
fdr_gene_set: 0.05
eps: 1.0e-50
spia:
# tool is only run if set to `true`
activate: true
# pathway database to use in SPIA, needs to be available for
# the species specified by resources -> ref -> species above
pathway_database: "reactome"
# OrgDB Genome wide annotation package (https://www.bioconductor.org/packages/release/BiocViews.html#___OrgDb) for the species under consideration.
# Only required if you want to have a gene analysis for your pathways. Else NA
orgDb: org.Hs.eg.db
pathvars:
sleuth_sample: "results/sleuth/{group}.samples.tsv"
input_file: "results/dmr_calls/{group}/genes_transcripts/chipseeker_postprocessed"
output_file: "results/tables/fgsea/{group}"
plot_file: "results/plots/fgsea/{group}"
log_file_name: "{group}"
wildcards:
# group: 'read_sample_tsv(config["sample_path"])'
group: '["BC02-ref-sorted", "BC03-ref-sorted", "BC04-ref-sorted",]'

effect_col:
dynamic: false
dynamic_expression: ""
static_value: "mean_methylation_difference"

# Not used right now
meta_comparisons:
# comparison is only run if set to `true`
activate: true
activate: false
# Define here the comparisons under interest
comparisons:
# Define any name for comparison. You can add as many comparisions as you want
Expand Down
17 changes: 9 additions & 8 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,21 @@ include: "rules/common.smk"
include: "rules/dmr_heatmap.smk"
include: "rules/dmr_calling.smk"
include: "rules/prepare_meth_calling.smk"
include: "rules/prepare_enrichment.smk"
include: "rules/methylation_calling.smk"
include: "rules/annotate_dmrs.smk"


# declare https://github.com/Addimator/enrichment as a module
module enrichment:
snakefile:
github(
"Addimator/enrichment",
path="workflow/Snakefile",
# tag="v2.5.4",
branch="main",
)
# "/home/adrian/Documents/Promotion/locotact_stuff/enrichment/workflow/Snakefile"
# github(
# "Addimator/enrichment",
# path="workflow/Snakefile",
# # tag="v2.5.4",
# branch="main",
# )
"/homes/aprinz/enrichment/workflow/Snakefile"
config:
config

Expand All @@ -46,5 +47,5 @@ use rule * from enrichment
rule modified_all:
default_target: True
input:
# rules.all.input,
all_input,
rules.all.input,
6 changes: 6 additions & 0 deletions workflow/envs/biomart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
dependencies:
- bioconductor-biomart =2.58
- r-tidyverse =2.0
33 changes: 0 additions & 33 deletions workflow/envs/varlociraptor.yaml
Original file line number Diff line number Diff line change
@@ -1,36 +1,3 @@
# name: varlociraptor
# channels:
# - conda-forge
# - bioconda
# - rmg
# - nodefaults
# dependencies:
# - alignoth=0.7.3
# # - bcftools=1.16
# # - bwa=0.7.17
# - bzip2=1.0.8
# - ca-certificates=2022.12.7
# - cmake=3.25.1
# - curl=7.87.0
# - expat=2.5.0
# - gcc=12.2.0
# - gsl=2.7
# # - htslib=1.16
# - make=4.3
# - mason=2.0.9
# - perl=5.32.1
# - pkg-config=0.29.2
# - rust=1.80.0
# # - samtools=1.16.1
# - xz=5.2.6
# # - zlib=1.2.13
# - zlib-ng=2.0.6
# - zstd=1.5.2
# - libclang=16.0.6
# - clangdev=16.0.6
# - libgcc-ng


channels:
- conda-forge
- bioconda
Expand Down
6 changes: 6 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ def read_sample_tsv(sample_tsv_path):
samples = read_sample_tsv(sample_tsv_path)


def get_bioc_species_name():
first_letter = config["resources"]["ref"]["species"][0]
subspecies = config["resources"]["ref"]["species"].split("_")[1]
return first_letter + subspecies


def all_input(wildcards):
wanted_input = []
wanted_input.extend(
Expand Down
11 changes: 7 additions & 4 deletions workflow/rules/methylation_calling.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ scattergather:

rule find_candidates:
input:
"resources/genome.fasta",
fasta="resources/genome.fasta",
varlo_path="resources/tools/varlociraptor",
output:
"resources/candidates/candidates.bcf",
Expand All @@ -15,8 +15,9 @@ rule find_candidates:
"../envs/varlociraptor.yaml"
shell:
"""
PIPELINE_PATH=$(pwd)
cd {input.varlo_path}
cargo run -- methylation-candidates {input} {output}
cargo run -- methylation-candidates $PIPELINE_PATH/{input.fasta} $PIPELINE_PATH/{output}
"""


Expand Down Expand Up @@ -51,8 +52,9 @@ rule compute_meth_observations:
sequencer=lambda wildcards: samples[wildcards.sample][1],
shell:
"""
PIPELINE_PATH=$(pwd)
cd {input.varlo_path}
cargo run --release -- preprocess variants {input.genome} --candidates {input.candidates} --bam {input.alignments} --read-type {params.sequencer} --max-depth 1000 > {output}
cargo run --release -- preprocess variants $PIPELINE_PATH/{input.genome} --candidates $PIPELINE_PATH/{input.candidates} --bam $PIPELINE_PATH/{input.alignments} --read-type {params.sequencer} --max-depth 1000 > $PIPELINE_PATH/{output}
"""


Expand All @@ -69,8 +71,9 @@ rule call_methylation:
"../envs/varlociraptor.yaml"
shell:
"""
PIPELINE_PATH=$(pwd)
cd {input.varlo_path}
cargo run --release -- call variants --omit-strand-bias generic --scenario {input.scenario} --obs normal={input.preprocess_obs} > {output}
cargo run --release -- call variants --omit-strand-bias generic --scenario $PIPELINE_PATH/{input.scenario} --obs normal=$PIPELINE_PATH/{input.preprocess_obs} > $PIPELINE_PATH/{output}
"""


Expand Down
47 changes: 47 additions & 0 deletions workflow/rules/prepare_enrichment.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
rule get_ensembl_genes:
input:
"results/dmr_calls/{group2}/genes_transcripts/chipseeker.tsv",
output:
"results/dmr_calls/{group2}/genes_transcripts/ensembl_genes.tsv",
conda:
"../envs/biomart.yaml"
log:
"logs/get_ensembl_genes_{group2}.log",
params:
species=get_bioc_species_name(),
version=config["resources"]["ref"]["release"],
script:
"../scripts/get_ensembl_genes.R"


# We only merge with polars because the join in get_ensembl_genes.R does not work
rule annotate_chipseeker:
input:
chipseeker="results/dmr_calls/{group2}/genes_transcripts/chipseeker.tsv",
genes="results/dmr_calls/{group2}/genes_transcripts/ensembl_genes.tsv",
output:
"results/dmr_calls/{group2}/genes_transcripts/chipseeker_postprocessed.tsv",
conda:
"../envs/python_standard.yaml"
log:
"logs/annotate_chipseeker{group2}.log",
script:
"../scripts/annotate_chipseeker.py"


# rule compose_sample_sheet:
# input:
# config["samples"],
# config["units"],
# kallisto_output=kallisto_output,
# output:
# "results/sleuth/{group}.samples.tsv",
# log:
# "logs/{group}.compose-sample-sheet.log",
# params:
# units=units,
# samples=samples,
# group:
# "sleuth-init"
# script:
# "../scripts/compose-sample-sheet.py"
33 changes: 33 additions & 0 deletions workflow/scripts/annotate_chipseeker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd
import sys

# Define file paths from Snakemake
chipseeker_file = snakemake.input["chipseeker"]
ensembl_genes_file = snakemake.input["genes"]
output_file = snakemake.output[0]

# Load the input files
chipseeker_df = pd.read_csv(chipseeker_file, sep="\t")
ensembl_genes_df = pd.read_csv(ensembl_genes_file, sep="\t")

# Rename columns to match for merging
ensembl_genes_df.rename(columns={"ensembl_transcript_id": "transcriptId"}, inplace=True)

# Perform the inner join
merged_df = pd.merge(chipseeker_df, ensembl_genes_df, on="transcriptId", how="inner")

# Drop rows with missing values in the specified columns
filtered_df = merged_df.dropna(
subset=["transcriptId", "ensembl_gene_id", "external_gene_name"]
)

filtered_df = filtered_df.rename(
columns={
"q_value": "qval",
"ensembl_gene_id": "ens_gene",
"external_gene_name": "ext_gene",
}
)

# Save the processed data to the output file
filtered_df.to_csv(output_file, sep="\t", index=False)
72 changes: 72 additions & 0 deletions workflow/scripts/get_ensembl_genes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# log <- file(snakemake@log[[1]], open="wt")
# sink(log)
# sink(log, type="message")

library(biomaRt)
library("tidyverse")
library("cli")

data <- read.table(snakemake@input[[1]], sep="\t", header=TRUE)

mart <- "useast"
rounds <- 0
while (class(mart)[[1]] != "Mart") {
mart <- tryCatch(
{
# done here, because error function does not
# modify outer scope variables, I tried
if (mart == "www") rounds <- rounds + 1
# equivalent to useMart, but you can choose
# the mirror instead of specifying a host
biomaRt::useEnsembl(
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
version = snakemake@params[["version"]],
mirror = mart
)
},
error = function(e) {
# change or make configurable if you want more or
# less rounds of tries of all the mirrors
if (rounds >= 3) {
cli_abort(
str_c(
"Have tried all 4 available Ensembl biomaRt mirrors ",
rounds,
" times. You might have a connection problem, or no mirror is responsive.\n",
"The last error message was:\n",
message(e)
)
)
}
# hop to next mirror
mart <- switch(mart,
useast = "uswest",
uswest = "asia",
asia = "www",
www = {
# wait before starting another round through the mirrors,
# hoping that intermittent problems disappear
Sys.sleep(30)
"useast"
}
)
}
)
}

gene_info <- getBM(attributes=c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name'),
filters='ensembl_transcript_id',
values=data$transcriptId,
mart=mart)


# Join does not work as intended
# data_with_genes <-
# inner_join(
# data,
# gene_info,
# by = join_by(transcriptId == ensembl_transcript_id))


write.table(gene_info, file=snakemake@output[[1]], sep="\t", row.names=FALSE, quote=FALSE)

0 comments on commit b80edbd

Please sign in to comment.