-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Addimator
committed
Oct 9, 2024
1 parent
934504f
commit b80edbd
Showing
9 changed files
with
210 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
dependencies: | ||
- bioconductor-biomart =2.58 | ||
- r-tidyverse =2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Run get_ensembl_genes.R on the ChIPseeker table of one group and store the
# transcript/gene table it writes.
rule get_ensembl_genes:
    input:
        "results/dmr_calls/{group2}/genes_transcripts/chipseeker.tsv",
    output:
        "results/dmr_calls/{group2}/genes_transcripts/ensembl_genes.tsv",
    log:
        "logs/get_ensembl_genes_{group2}.log",
    params:
        # the script builds the BioMart dataset name from `species` and passes
        # `version` on as the Ensembl release to connect to
        species=get_bioc_species_name(),
        version=config["resources"]["ref"]["release"],
    conda:
        "../envs/biomart.yaml"
    script:
        "../scripts/get_ensembl_genes.R"
|
||
|
||
# Join the ChIPseeker table with the Ensembl gene table. The merge is done in a
# separate pandas script only because the join in get_ensembl_genes.R does not
# work.
rule annotate_chipseeker:
    input:
        chipseeker="results/dmr_calls/{group2}/genes_transcripts/chipseeker.tsv",
        genes="results/dmr_calls/{group2}/genes_transcripts/ensembl_genes.tsv",
    output:
        "results/dmr_calls/{group2}/genes_transcripts/chipseeker_postprocessed.tsv",
    conda:
        "../envs/python_standard.yaml"
    log:
        # "_" separates rule name and wildcard, as in get_ensembl_genes
        "logs/annotate_chipseeker_{group2}.log",
    script:
        "../scripts/annotate_chipseeker.py"
|
||
|
||
# rule compose_sample_sheet: | ||
# input: | ||
# config["samples"], | ||
# config["units"], | ||
# kallisto_output=kallisto_output, | ||
# output: | ||
# "results/sleuth/{group}.samples.tsv", | ||
# log: | ||
# "logs/{group}.compose-sample-sheet.log", | ||
# params: | ||
# units=units, | ||
# samples=samples, | ||
# group: | ||
# "sleuth-init" | ||
# script: | ||
# "../scripts/compose-sample-sheet.py" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import pandas as pd | ||
import sys | ||
|
||
# Locations of the two input tables and of the result, as provided by Snakemake
chipseeker_path = snakemake.input["chipseeker"]
genes_path = snakemake.input["genes"]
result_path = snakemake.output[0]

# Read both tab-separated tables
annotation = pd.read_csv(chipseeker_path, sep="\t")
genes = pd.read_csv(genes_path, sep="\t")

# Give the gene table the same key column name as the ChIPseeker table
genes = genes.rename(columns={"ensembl_transcript_id": "transcriptId"})

# Inner join on the transcript ID, discard rows that lack any of the three
# identifiers, then switch to the short column names used in the output
annotated = (
    annotation.merge(genes, on="transcriptId", how="inner")
    .dropna(subset=["transcriptId", "ensembl_gene_id", "external_gene_name"])
    .rename(
        columns={
            "q_value": "qval",
            "ensembl_gene_id": "ens_gene",
            "external_gene_name": "ext_gene",
        }
    )
)

# Write the joined table as TSV without the index column
annotated.to_csv(result_path, sep="\t", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# log <- file(snakemake@log[[1]], open="wt") | ||
# sink(log) | ||
# sink(log, type="message") | ||
|
||
library(biomaRt) | ||
library("tidyverse") | ||
library("cli") | ||
|
||
# Load the ChIPseeker table; only its `transcriptId` column is used further
# down. Quoting is limited to double quotes and comment parsing is switched
# off: with the read.table() defaults (quote = "\"'", comment.char = "#") an
# apostrophe or "#" inside a field would merge or truncate rows.
# NOTE(review): ChIPseeker annotations such as "3' UTR" look like a probable
# trigger -- confirm against the actual input columns.
data <- read.table(
  snakemake@input[[1]],
  sep = "\t",
  header = TRUE,
  quote = "\"",
  comment.char = ""
)
|
||
# Connect to Ensembl BioMart, cycling through the mirrors until one responds.
# `mart` holds the name of the mirror to try next (a string) until a
# connection succeeds; then it holds the Mart object and the loop ends.
mart <- "useast"
rounds <- 0
while (!inherits(mart, "Mart")) {
  mart <- tryCatch(
    {
      # "www" is the last mirror of a cycle, so reaching it counts as one
      # round. Counted here rather than in the error handler, because
      # assignments inside the handler do not modify outer scope variables.
      if (mart == "www") rounds <- rounds + 1
      # equivalent to useMart, but you can choose
      # the mirror instead of specifying a host
      biomaRt::useEnsembl(
        biomart = "ENSEMBL_MART_ENSEMBL",
        dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
        version = snakemake@params[["version"]],
        mirror = mart
      )
    },
    error = function(e) {
      # change or make configurable if you want more or
      # less rounds of tries of all the mirrors
      if (rounds >= 3) {
        # conditionMessage() extracts the error text; message(e) would only
        # print it and return NULL. cli interpolation is used so that braces
        # in the error text are not parsed as glue expressions.
        cli_abort(
          c(
            "Have tried all 4 available Ensembl biomaRt mirrors {rounds} times.",
            "i" = "You might have a connection problem, or no mirror is responsive.",
            "x" = "The last error message was: {conditionMessage(e)}"
          )
        )
      }
      # hop to next mirror: the value of switch() is the handler's return
      # value and therefore becomes the new `mart` in the enclosing loop
      switch(mart,
        useast = "uswest",
        uswest = "asia",
        asia = "www",
        www = {
          # wait before starting another round through the mirrors,
          # hoping that intermittent problems disappear
          Sys.sleep(30)
          "useast"
        }
      )
    }
  )
}
|
||
# Look up the gene ID and gene name for the transcripts in the input table.
# Each transcript ID is sent only once: repeated IDs add nothing to the lookup
# and only enlarge the query.
# NOTE(review): presumably the input has several rows per transcript -- confirm.
gene_info <- getBM(
  attributes = c(
    "ensembl_transcript_id",
    "ensembl_gene_id",
    "external_gene_name"
  ),
  filters = "ensembl_transcript_id",
  values = unique(data$transcriptId),
  mart = mart
)

# Join does not work as intended
# data_with_genes <-
#   inner_join(
#     data,
#     gene_info,
#     by = join_by(transcriptId == ensembl_transcript_id))

# Write the lookup table as unquoted TSV without row names
write.table(
  gene_info,
  file = snakemake@output[[1]],
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)