## DE_pipeline.snake

## SeA-SnaP differential expression pipeline for RNA-seq analysis
## version: 1.0
## author: J.P.Pett (patrick.pett@bihealth.de)

#TODO: replace config value check by built-in? https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#validation

import os, re, yaml, textwrap, pandas as pd
from collections import OrderedDict
from time import asctime, localtime, time
from pathlib import Path
from snakemake.utils import report, format as snakemake_format
from tools.pipeline_tools import DEPipelinePathHandler, ReportTool

# dump OrderedDict instances like plain dicts when writing YAML
yaml.add_representer(OrderedDict, lambda dumper, data: dumper.represent_dict(dict(data)))

# locations derived from the directory that contains this Snakefile
SNAKEDIR  = Path(workflow.current_basedir)
SNAKEFILE = workflow.snakefile
SCRIPTDIR = str(SNAKEDIR / "external_scripts")

# assemble config
# the user config file name can be passed as config value "file_name"; default is DE_config.yaml
config_file_name = config["file_name"] if "file_name" in config else "DE_config.yaml"
# load order matters: pipeline defaults first, then the user config
configfile: str(SNAKEDIR / "defaults" / "DE_config_defaults.yaml")
configfile: config_file_name
if config["organism_defaults"]:
	# organism defaults are loaded next and the user config is loaded once more after them
	# (presumably so that user settings keep precedence over the organism defaults)
	configfile: str(SNAKEDIR / "defaults" / config["organism_defaults"])
	configfile: config_file_name

# create path handler
# the ranges file is only handed to the path handler if pipeline_param: test_config is set
conf_ranges = str(SNAKEDIR / "defaults" / "DE_config_ranges.yaml")
test_config = conf_ranges if config["pipeline_param"]["test_config"] else None
pph = DEPipelinePathHandler(workflow, test_config)

# exclude symbols '.' and '/' from wildcards
wildcard_constraints: 
	sample="[^./]+"


# R snippet printing sessionInfo() between two banner lines; the R script
# templates of the rules below reference it as {R_SESSION_INFO}
R_SESSION_INFO = r"""
cat("########################### session info ############################","\n")
print(sessionInfo())
cat("#####################################################################","\n\n")
"""

onstart:
	# draw a dag
	dag_file = pph.file_path(step="pipeline_report", extension="rule_execution.png", contrast="all")
	os.makedirs(os.path.dirname(dag_file), exist_ok=True)
	shell("snakemake --quiet --snakefile {} --rulegraph | dot -Tpng 1> {}".format(SNAKEFILE, dag_file))
	# info about the pipeline run
	info_file = pph.file_path(step="pipeline_report", extension="summary.csv", contrast="all")
	os.makedirs(os.path.dirname(info_file), exist_ok=True)
	shell("snakemake --quiet --snakefile {} --summary | sed 's/\t/, /g' 1> {}".format(SNAKEFILE, info_file))
	# save merged config
	config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
	with open(config_file, "w") as f: yaml.dump(config, f, default_flow_style=False)


##-------------------- starting point ----------------------------------------------------------------------

def get_inputs_all():
	"""
	Assemble the target files requested by the rules `all` and `export`.

	Starts from the contrast results, adds the functional annotation / gene set
	outputs and finally one file per entry of config["time_series"] and
	config["dodr"]["comparisons"]. Every path is obtained from the path handler.
	"""
	# contrasts
	targets = pph.expand_path(step="contrast", extension="rds")

	# functional annotation
	functional_targets = (
		pph.expand_path(step = "goseq", extension = "go.rds",   if_set = dict(goseq=True) ),
		pph.expand_path(step = "goseq", extension = "kegg.rds", if_set = dict(goseq=True) ),
		pph.file_path(step = "annotation", extension = "rds", contrast="all"),
		pph.file_path(step = "export_raw_counts", extension = "xlsx", contrast = "all"),
		pph.expand_path(step = "cluster_profiler", extension = "rds", if_set = dict(cluster_profiler=dict(run=True)) ),
		pph.file_path(step = "tmod_dbs", extension = "rds", contrast="all"),
		pph.file_path(step = "tmod_pca", extension = "rds", contrast="all", if_set=dict(tmod_pca=True)),
		pph.expand_path(step = "tmod", extension = "rds", if_set = dict(tmod=True)),
	)
	# each result is appended as a single element, exactly as returned by the path handler
	for target in functional_targets:
		targets.append(target)

	# time series
	for series in config["time_series"]:
		targets.append(pph.file_path(step = "rain", extension = "tsv", contrast=series))

	# time series comparisons
	for comparison in config["dodr"]["comparisons"]:
		targets.append(pph.file_path(step = "dodr", extension = "tsv", contrast=comparison))

	return targets


# delete the html report (the output of rule all) if it exists; this is a module-level
# statement, so it is executed every time this Snakefile is evaluated
# (presumably to force rule all to run again -- confirm)
shell("rm -f {}".format(pph.file_path(step="pipeline_report", extension="report.html", contrast="all")))

rule all:
	input:
		# all requested result files of the pipeline
		get_inputs_all(),
		# html generation does not work
		#pph.file_path(step="report_html", extension="html", contrast="all"),
		pph.file_path("report", "Rmd", contrast = "all")
	output:
		html = pph.file_path(step="pipeline_report", extension="report.html", contrast="all")
	run:
		# NOTE: the local names below (loctime, dag, summary, version_info, conda_info)
		# are referenced as {fields} in the report text, so they must keep their names
		loctime = asctime(localtime(time()))
		# rule graph and summary table are the files written in the onstart handler
		rule_execution = pph.file_path("pipeline_report", "rule_execution.png", contrast="all")
		summary        = pph.file_path("pipeline_report", "summary.csv",        contrast="all")
		version_info   = pph.file_path("pipeline_report", "version_info.txt",   contrast="all")
		conda_info     = pph.file_path("pipeline_report", "conda_info.txt",     contrast="all")
		# file name of the rule graph without its directory
		dag = rule_execution.split("/")[-1]
		# record the package list and configuration of the active conda environment
		shell("conda list > {}".format(version_info))
		shell("conda info > {}".format(conda_info))
		# write the html report (reStructuredText source)
		report("""
		=======================
		RNAseq mapping pipeline
		=======================
		
		**Finished: {loctime}**
		
		.. image:: {dag}
		
		File status at pipeline start:
		==============================
		
		.. csv-table::
			:file: {summary}
			
		Version info:
		=============
		
		.. include:: {version_info}
			:literal:
		
		Conda info:
		===========
		
		.. include:: {conda_info}
			:literal:
		
		""", output.html, graph = rule_execution, table = summary)


rule export:
	# requires the same result files as rule all, then hands over to the
	# export routine of the path handler (no output files are declared here)
	input:
		get_inputs_all()
	run:
		pph.export()

	
##-------------------- make TxDb -----------------------------------------------------------------------------

rule TxDb_from_GTF:
	""" make a TxDb object from GTF """
	input:
		gtf = config["organism"]["files"]["gtf"]
	output:
		pph.file_path(step = "TxDb_from_GTF", extension = "sqlite", contrast = "all")
	log:
		out = pph.file_path("TxDb_from_GTF", "output.log", contrast = "all", log=True)
	run:
		genus = config["organism"]["genus"]
		taxon = config["organism"]["taxon"]
		
		script = textwrap.dedent(r"""
		#----- import packages
		library(GenomicFeatures)
		
		{R_SESSION_INFO}
		
		#----- make transcript to gene table
		TxDb <- makeTxDbFromGFF("{input.gtf}", format="gtf", organism="{genus}", taxonomyId={taxon})
		
		#----- save Db
		saveDb(TxDb, "{output}")
		""")
		
		script_file = pph.log(log.out, snakemake_format(script), step="TxDb_from_GTF", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")


##-------------------- import -------------------------------------------------------------------------------

# design formula from the config; inserted as `design = {DESIGN}` into the R scripts of the import rules
DESIGN = config["experiment"]["design_formula"]


rule import_gene_counts:
	""" collect count files (STAR) and build count matrix """
	input:
		samples = config["experiment"]["covariate_file"]["star"]
	output:
		rds=pph.file_path("import_gene_counts", "rds", contrast = "all"),
		tsv=pph.file_path("import_gene_counts", "tsv", contrast = "all"),
		xlsx=pph.file_path("import_gene_counts", "xlsx", contrast = "all")
	log:
		out = pph.file_path("import_gene_counts", "output.log", contrast = "all", log=True)
	params:
		tpm_xlsx=pph.file_path("import_gene_counts", "tpm.xlsx", contrast = "all"),
		annot_pkg  = config["organism"]["R"]["annotations"]
	run:
		config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
		column_filter = ""
		if config["filters"]["experiment_whitelist"]:
			column_filter += "&".join("is.element(sample_df${},{})".format( k, "c({})".format(",".join('"'+l+'"' for l in v)) )
								for k,v in config["filters"]["experiment_whitelist"].items())
		if config["filters"]["experiment_blacklist"]:
			column_filter += "&".join("!is.element(sample_df${},{})".format( k, "c({})".format(",".join('"'+l+'"' for l in v)) )
								for k,v in config["filters"]["experiment_blacklist"].items())
		if column_filter == "":
			column_filter = "TRUE"

		r_gene_list = ("'"+config["filters"]["gene_list"]["file"]+"'") if config["filters"]["gene_list"]["file"] else "NULL"
		r_gene_list_type = config["filters"]["gene_list"]["type"] or "NULL"
		
		column_lvl = config["experiment"]["columns"] if "columns" in config["experiment"] else {}
		level_list = "list({})".format(",".join("{}=c({})".format( n, ",".join('"'+s+'"' for s in l) ) for n,l in column_lvl.items()))
		
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		library(writexl)
		library(AnnotationDbi)
		library({params.annot_pkg})

		{R_SESSION_INFO}

		#----- variables
		sample_f  <- "{input.samples}"
		output_xlsx <- "{output.xlsx}"
		output_tsv <- "{output.tsv}"
		output_tpm_xlsx <- "{params.tpm_xlsx}"
		output_rds  <- "{output.rds}"
		conf.f <- "{config_file}"
		gene_list <- {r_gene_list}
		subset_genes_type <- "{r_gene_list_type}"

		#----- import data
		sample_df <- read.table(sample_f, header=TRUE)
		files     <- as.character(sample_df$filename); print(files)
		samples   <- as.character(sample_df$label)
		config <- yaml::yaml.load_file(conf.f)

		#----- merge count files from star
		df_list   <- lapply(1:length(files), function(i) as.data.frame(read.csv(files[i], skip=4, sep="\t", 
		                                                                        col.names=c("ID", samples[i]), check.names=FALSE, header=FALSE)))
		count_dat <- Reduce(function(...) merge(..., all=TRUE, by="ID"), df_list)
		rownames(count_dat) <- count_dat$ID

		#----- subset genes
		if (!is.null(gene_list)) {{
			subset_genes <- as.character(read.csv(gene_list, header=F)[,1]); print(subset_genes)
			if (subset_genes_type == "ENSEMBL"){{
				subset_ensembl_genes <- subset_genes
			}} else {{
				subset_ensembl_genes <- mapIds({params.annot_pkg}, keys=subset_genes, column="ENSEMBL",
											   keytype=subset_genes_type, multiVals="first"); print(subset_ensembl_genes)
			}}
			count_dat <- count_dat[subset_ensembl_genes[!is.na(subset_ensembl_genes)],]
		}}

		#----- if present, merge tpm values (generated from star bam with TPMcalculator) for display along results
		if ("tpm" %in% colnames(sample_df)) {{
			tpm_files <- as.character(sample_df$tpm)
			tpm_df_list <- lapply(1:length(tpm_files), function(i) as.data.frame(read.csv(tpm_files[i], sep="\t",
		                                                                        col.names=c("ID", samples[i]), check.names=FALSE, header=FALSE)))
			tpm_dat <- Reduce(function(...) merge(..., all=TRUE, by="ID"), tpm_df_list); rownames(tpm_dat) <- tpm_dat$ID
		}}

		#----- change levels
		level_cols <- {level_list}
		for (col in names(level_cols)) {{
			sample_df[,col] <- factor(sample_df[,col], levels = level_cols[[col]])
		}}

		#----- import counts in DESeq2
		rownames(sample_df) <- sample_df$label; count_dat <- count_dat[, rownames(sample_df)]
		dds <- DESeqDataSetFromMatrix(countData = as.matrix(count_dat), colData = sample_df, design = {DESIGN})
		
		#----- save as rds
		saveRDS(dds, file=output_rds)

		#----- export counts as tables
		colnames(count_dat) <- sample_df$label
		write_xlsx(count_dat, path=output_xlsx)
		write.table(count_dat, file=output_tsv, sep="\t", quote=F)
		if ("tpm" %in% colnames(sample_df)) {{
			symbol <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", tpm_dat$ID),
		                     column="SYMBOL", keytype="ENSEMBL", multiVals="first")
		    tpm_dat <- cbind(symbol, tpm_dat)
			colnames(tpm_dat) <- c("Symbol", "geneID", as.character(sample_df$label))
			write_xlsx(tpm_dat, path=output_tpm_xlsx)
		}}
		""")
		
		script_file = pph.log(log.out, snakemake_format(script), step="import_gene_counts", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

rule import_featurecounts:
	""" collect count files (featurecounts) and build count matrix """
	input:
		samples = config["experiment"]["covariate_file"]["star"]
	output:
		pph.file_path("import_featurecounts", "rds", contrast = "all")
	log:
		out = pph.file_path("import_featurecounts", "output.log", contrast = "all", log=True)
	run:
		column_filter = ""
		if config["filters"]["experiment_whitelist"]:
			column_filter += "&".join("is.element(sample_df${},{})".format( k, "c({})".format(",".join('"'+l+'"' for l in v)) )
								for k,v in config["filters"]["experiment_whitelist"].items())
		if config["filters"]["experiment_blacklist"]:
			column_filter += "&".join("!is.element(sample_df${},{})".format( k, "c({})".format(",".join('"'+l+'"' for l in v)) )
								for k,v in config["filters"]["experiment_blacklist"].items())
		if column_filter == "":
			column_filter = "TRUE"
								
		column_lvl = config["experiment"]["columns"] if "columns" in config["experiment"] else {}
		level_list  = "list({})".format(",".join("{}=c({})".format( n, ",".join('"'+s+'"' for s in l) ) for n,l in column_lvl.items()))
		
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		
		{R_SESSION_INFO}
		
		#----- import data
		sample_df <- read.table("{input.samples}", header=TRUE)
		sample_df  <- sample_df[{column_filter},]
		files     <- as.character(sample_df$filename); print(files)
		samples   <- as.character(sample_df$label)

		#----- merge count files
		read_fc <- function(fn, sn) read.csv(fn, comment="#", sep="\t", colClasses=c("character", rep("NULL", 5), "numeric"),
		                                             col.names=c("ID", rep("dummy", 5), sn), check.names=FALSE)
		df_list   <- lapply(1:length(files), function(i) as.data.frame(read_fc(files[i], samples[i])))
		count_dat <- Reduce(function(...) merge(..., all=TRUE, by="ID"), df_list); rownames(count_dat) <- count_dat$ID
		
		#----- change levels
		level_cols <- {level_list}
		for (col in names(level_cols)) {{
			sample_df[,col] <- factor(sample_df[,col], levels = level_cols[[col]])
		}}

		#----- import counts in DESeq2
		rownames(sample_df) <- sample_df$label; count_dat <- count_dat[, rownames(sample_df)]
		dds <- DESeqDataSetFromMatrix(countData = as.matrix(count_dat), colData = sample_df, design = {DESIGN})
		
		#----- save as rds
		saveRDS(dds, file="{output}")
		""")
		
		script_file = pph.log(log.out, snakemake_format(script), step="import_featurecounts", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")
	
rule import_sf:
	""" collect sf files (Salmon) and build count matrix """
	input:
		txdb    = pph.file_path("TxDb_from_GTF", "sqlite", contrast = "all"),
		samples = config["experiment"]["covariate_file"]["salmon"]
	output:
		pph.file_path("import_sf", "rds", contrast = "all")
	log:
		out = pph.file_path("import_sf", "output.log", contrast = "all", log=True)
	run:
		column_filter = ""
		if config["filters"]["experiment_whitelist"]:
			column_filter += "&".join("is.element(sample_df${},{})".format( k, "c({})".format(",".join('"'+l+'"' for l in v)) ) 
								for k,v in config["filters"]["experiment_whitelist"].items())
		if config["filters"]["experiment_blacklist"]:
			column_filter += "&".join("!is.element(sample_df${},{})".format( k, "c({})".format(",".join('"'+l+'"' for l in v)) ) 
								for k,v in config["filters"]["experiment_blacklist"].items())
		if column_filter == "":
			column_filter = "TRUE"
								
		column_lvl = config["experiment"]["columns"] if "columns" in config["experiment"] else {}
		level_list = "list({})".format(",".join("{}=c({})".format( k, ",".join('"'+l+'"' for l in v) ) for k,v in column_lvl.items()))
		
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		library(tximport)
		library(readr)
		library(AnnotationDbi)
		
		{R_SESSION_INFO}
		
		#----- make transcript to gene table
		TxDb    <- loadDb(file = "{input.txdb}")
		k       <- keys(TxDb, keytype = "TXNAME")
		tx2gene <- select(TxDb, k, "GENEID", "TXNAME")
		
		#----- use tximport on input files
		sample_df  <- read.table("{input.samples}", header=TRUE)
		sample_df  <- sample_df[{column_filter},]
		files      <- as.character(sample_df$filename); print(files)
		txi        <- tximport(files, type = "salmon", tx2gene = tx2gene)
		
		#----- change levels
		level_cols <- {level_list}
		for (col in names(level_cols)) {{
			sample_df[,col] <- factor(sample_df[,col], levels = level_cols[[col]])
		}}

		#----- import txi in DESeq2
		rownames(sample_df) <- sample_df$label
		dds <- DESeqDataSetFromTximport(txi, colData = sample_df, design = {DESIGN})
		
		#----- save as rds
		saveRDS(dds, file="{output}")
		""")
		script_file = pph.log(log.out, snakemake_format(script), step="import_sf", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

##-------------------- export_raw_counts -------------------------------------------------------------------------------

rule export_raw_counts:
	""" export DESeq2 raw counts from the DESeq2 object as XLSX"""
	input:
		rds = pph.choose_input(choice_name = "mapping", options = [
		dict(step = "import_gene_counts",   extension = "rds", contrast = "all"),
		dict(step = "import_featurecounts", extension = "rds", contrast = "all"),
		dict(step = "import_sf",            extension = "rds", contrast = "all")
		])
	output:
		xlsx = pph.file_path("export_raw_counts", "xlsx", contrast = "all"),
		csv  = pph.file_path("export_raw_counts", "csv", contrast = "all")
	log:
		out = pph.file_path("export_raw_counts", "output.log", contrast = "all", log=True)
	run:
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		library(writexl)
    library(tibble)
    library(readr)

		{R_SESSION_INFO}

		#----- variables
		output_xlsx <- "{output.xlsx}"
		output_csv  <- "{output.csv}"
		input_rds   <- "{input.rds}"

		#----- read RDS object, extract count data, save as XLSX
		dds <- readRDS(input_rds)
    count_dat <- assay(dds)
		count_dat <- as.data.frame(count_dat) %>% rownames_to_column("PrimaryID")
		col_dat <- colData(dds)
		colnames(count_dat)[-1] <- col_dat$label
    write_csv(count_dat, path=output_csv)
    write_xlsx(count_dat, path=output_xlsx)
		""")
		script_file = pph.log(log.out, snakemake_format(script), step="export_raw_counts", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

##-------------------- DESeq2 -------------------------------------------------------------------------------

rule DESeq2:
	""" run DESeq2 on a count matrix """
	input:
		pph.choose_input(choice_name = "mapping", options = [
		dict(step = "import_gene_counts",   extension = "rds", contrast = "all"),
		dict(step = "import_featurecounts", extension = "rds", contrast = "all"),
		dict(step = "import_sf",            extension = "rds", contrast = "all")
		])
	output:
		dds           = pph.file_path("DESeq2", "deseq2.rds", contrast = "all"),
		rld_blind     = pph.file_path("DESeq2", "rld.blind.rds", contrast = "all"),
		rld_model     = pph.file_path("DESeq2", "rld.model.rds", contrast = "all"),
		rld_blind_csv = pph.file_path("DESeq2", "rld.blind.csv", contrast = "all"),
		rld_model_csv = pph.file_path("DESeq2", "rld.model.csv", contrast = "all")
	log:
		out = pph.file_path("DESeq2", "output.log", contrast = "all", log=True)
	params:
		coef_names = pph.file_path("DESeq2", "coef_names.txt", contrast = "all"),
		rld_names  = pph.file_path("DESeq2", "rld.model.csv", contrast = "all"),
		annot_pkg  = config["organism"]["R"]["annotations"]
	run:
		config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
		count_threshold       = config["filters"]["low_counts"]
		normalized_expression = config["normalization"]["normalized_expression"]
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		library(AnnotationDbi)
		library({params.annot_pkg})
		
		{R_SESSION_INFO}

		conf.f   <- "{config_file}"

		#----- load config
		config <- yaml::yaml.load_file(conf.f)

		#----- load DESeqDataSet
		dds <- readRDS("{input}")
		
		#----- pre-filtering
		keep <- rowSums(counts(dds)) >= {count_threshold}

		## at least min_counts in at least min_count_n number of samples
		if(!is.null(config$filters$min_counts) && !is.null(config$filters$min_count_n)) {{
		  rs <- rowSums(counts(dds) > config$filters$min_counts)
		  keep <- keep & rs >= config$filters$min_count_n
		}}

		dds  <- dds[keep,]; print(dds)
		
		#----- DESeq2
		dds <- DESeq(dds, betaPrior=FALSE, test="Wald")

		# ----- Normalised expression values
		rld_blind <- switch("{normalized_expression}",
			"rld"=rlog(dds, blind=TRUE),
			"vst"=vst(dds, blind=TRUE)
		)
		rld_model <- switch("{normalized_expression}",
			"rld"=rlog(dds, blind=FALSE),
			"vst"=vst(dds, blind=FALSE)
		)

		#----- save as rds
		saveRDS(dds, file="{output.dds}")
		saveRDS(rld_blind, file="{output.rld_blind}")
		saveRDS(rld_model, file="{output.rld_model}")

		#----- save other files
		writeLines(resultsNames(dds), "{params.coef_names}")
		write.table(data.frame(
			gene_id=rownames(rld_model),
			assay(rld_model),
			symbol=mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(dds)), 
			              column="SYMBOL", keytype="ENSEMBL", multiVals="first"),
			entrez=mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(dds)), 
			              column="ENTREZID", keytype="ENSEMBL", multiVals="first"),
			check.names=FALSE
			), file="{output.rld_model_csv}", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
		write.table(data.frame(
			gene_id=rownames(rld_blind),
			assay(rld_blind),
			symbol=mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(dds)), 
			              column="SYMBOL", keytype="ENSEMBL", multiVals="first"),
			entrez=mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(dds)), 
			              column="ENTREZID", keytype="ENSEMBL", multiVals="first"),
			check.names=FALSE
			), file="{output.rld_blind_csv}", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
		""")
		script_file = pph.log(log.out, snakemake_format(script), step="DESeq2", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")
		
		
##-------------------- contrasts ----------------------------------------------------------------------------

rule contrast:
	""" make a contrast """
	input:
		pph.file_path("DESeq2", "deseq2.rds", contrast = "all")
	output:
		rds = pph.file_path("contrast", "rds"),
		csv = pph.file_path("contrast", "csv")
	log:
		out           = pph.file_path("contrast", "output.log",  log=True),
		contrast_yaml = pph.file_path("contrast", "contrast.yaml", log=True)
	params:
		ma_plot    = pph.file_path("contrast", "ma.pdf"),
		count_plot = pph.file_path("contrast", "counts.pdf"),
		annot_pkg  = config["organism"]["R"]["annotations"]
	run:
		contrast = pph.get_contrast(wildcards.contrast)
		with open(log.contrast_yaml, "w") as f: yaml.dump(contrast, f, default_flow_style=False)
		
		# variants of contrast definition
		contrast_column, contrast_ref = "group", ""
		if "coef" in contrast:
			contrast_type          = "coef"
			arg_val                = '="{}"'.format(contrast["coef"])
			contrast_def, lfc_def  = "name" + arg_val, "coef" + arg_val
		elif "ratio" in contrast:
			contrast_type, ratio   = "ratio", contrast["ratio"]
			contrast_column, contrast_num, contrast_ref = ratio["column"], ratio["numerator"], ratio["denominator"]
			contrast_def = lfc_def = 'contrast=c("{}", "{}", "{}")'.format(contrast_column, contrast_num, contrast_ref)
		elif "vector" in contrast:
			contrast_type          = "vector"
			contrast_def = lfc_def = 'contrast=c({})'.format(", ".join(map(str, contrast["vector"])))
		else:
			raise ValueError("Error in contrast: no valid contrast definition! must be one of {}".format(["coef:", "ratio:", "vector:"]))
		
		# parameters
		cutoff_FDR           =           contrast["max_p_adj"]
		lfcThreshold         =           contrast["results_parameters"]["lfcThreshold"]
		altHypothesis        =           contrast["results_parameters"]["altHypothesis"]
		independentFiltering = "TRUE" if contrast["results_parameters"]["independentFiltering"] else "FALSE"
		lfc_shrink_type      =           contrast["lfcShrink_parameters"]["type"]
		rank_by              =           contrast["ranking_by"]
		rank_order           =           contrast["ranking_order"].replace("x", "res$"+rank_by)
		
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		library(AnnotationDbi)
		library({params.annot_pkg})
		
		{R_SESSION_INFO}
		
		#----- load DESeqDataSet
		dds <- readRDS("{input}")
		
		#----- contrast
		if ("{contrast_type}" == "ratio") {{
			dds${contrast_column} = relevel(dds${contrast_column}, ref="{contrast_ref}")
			dds <- nbinomWaldTest(dds)
		}}
		res <- results(dds, {contrast_def}, alpha={cutoff_FDR}, lfcThreshold={lfcThreshold}, 
		               altHypothesis="{altHypothesis}", independentFiltering={independentFiltering})

		#----- log-fold-change shrinkage
		#res$lfcShrunk   <- res$log2FoldChange
		#res$lfcShrunkSE <- res$lfcSE
		if ("{lfc_shrink_type}"!="none"){{
			lfc <- lfcShrink(dds, {lfc_def}, lfcThreshold={lfcThreshold}, type="{lfc_shrink_type}")
			res$log2FoldChange_orig <- res$log2FoldChange
			res$log2FoldChange      <- lfc$log2FoldChange
			res$lfcSE_orig          <- res$lfcSE
			res$lfcSE               <- lfc$lfcSE
		}}
		
		#----- ranking
		res <- res[order({rank_order}),]
		
		#----- annotation [BEWARE- the in case on multiple mapping, the first entry will be selected]
		res$symbol <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(res)), 
		                     column="SYMBOL", keytype="ENSEMBL", multiVals="first")
		res$entrez <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(res)), 
		                     column="ENTREZID", keytype="ENSEMBL", multiVals="first")
		
		#----- save as rds & csv
		saveRDS(res, file="{output.rds}")
		write.table(data.frame(gene_id=rownames(res), res), file="{output.csv}", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
		""")
		
		script_file = pph.log(log.out, snakemake_format(script), step="contrast", extension="R", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

## ------------------------ annotation ---------------------------
## Annotation rule produces a table with mapping between the Primary IDs
## (ENSEMBL usually) and other potentially useful IDs as well as gene
## descriptions.

rule annotation:
	""" Adding annotation information """
	input:
		# we use dds as this is the most reliable source of the primary IDs
		dds       = pph.file_path("DESeq2", "deseq2.rds", contrast = "all")
	output:
		res = pph.file_path(step = "annotation", extension = "rds", contrast="all"),
		csv = pph.file_path(step = "annotation", extension = "csv", contrast="all")
	log:
		out = pph.file_path(step = "annotation", extension = "output.log", log=True, contrast="all")
	run:
		config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
		script = textwrap.dedent(r"""
			library(AnnotationDbi)
			{R_SESSION_INFO}

			res.file <- "{output.res}"
			res.csv  <- "{output.csv}"
			script.d <- "{SCRIPTDIR}"
			conf.f   <- "{config_file}"
      dds      <- "{input.dds}"
			 
			# ----------------------------------------
			# No Snakemake wildcards beyond this point
			# ----------------------------------------
			library(orthomapper)

			config <- yaml::yaml.load_file(conf.f)

			gene_ids <- rownames(readRDS(dds))

			res <- data.frame(PrimaryID=gene_ids, stringsAsFactors=FALSE)
			res$ENSEMBL <- sub("\\..*$", "", res$PrimaryID)

			annot <- orthomapper::entrez_annotate(res$ENSEMBL, taxon=as.numeric(config$organism$taxon),
				keytype="ENSEMBL", column=c("SYMBOL", "ENTREZID", "REFSEQ", "GENENAME"))

		  if(!all(annot[,1] == res$ensembl)) 
				stop("object returned by entrez_annotate() does not match query")

			## no need to duplicate columns
			res <- cbind(res, annot[, -1, drop=FALSE])

			write.csv(res, row.names=FALSE, file=res.csv)
	    saveRDS(res, file=res.file)
		""")

		script_file = pph.log(log.out, snakemake_format(script), step="annotation", extension="R", contrast="all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")


##-------------------- contrasts (full) ---------------------------------------------------------------------
## Same as the rule `contrast`, but saves all results from DESeq2;
## everything is named contrasts_full for consistency

rule contrasts_full:
	""" make all contrasts (full version) """
	input:
		pph.file_path("DESeq2", "deseq2.rds", contrast = "all")
	output:
		rds = pph.file_path("contrasts_full", "rds"),
		csv = pph.file_path("contrasts_full", "csv")
	log:
		out           = pph.file_path("contrasts_full", "output.log",  log=True),
	params:
		annot_pkg  = config["organism"]["R"]["annotations"]
	run:
		contrast = pph.get_contrast(wildcards.contrast)
		#with open(log.contrast_yaml, "w") as f: yaml.dump(contrast, f, default_flow_style=False)
		
		# variants of contrast definition
		contrast_column, contrast_ref = "group", ""
		if "coef" in contrast:
			contrast_type          = "coef"
			arg_val                = '="{}"'.format(contrast["coef"])
			contrast_def, lfc_def  = "name" + arg_val, "coef" + arg_val
		elif "ratio" in contrast:
			contrast_type, ratio   = "ratio", contrast["ratio"]
			contrast_column, contrast_num, contrast_ref = ratio["column"], ratio["numerator"], ratio["denominator"]
			contrast_def = lfc_def = 'contrast=c("{}", "{}", "{}")'.format(contrast_column, contrast_num, contrast_ref)
		elif "vector" in contrast:
			contrast_type          = "vector"
			contrast_def = lfc_def = 'contrast=c({})'.format(", ".join(map(str, contrast["vector"])))
		else:
			raise ValueError("Error in contrast: no valid contrast definition! must be one of {}".format(["coef:", "ratio:", "vector:"]))
		
		# parameters
		cutoff_FDR           =           contrast["max_p_adj"]
		lfcThreshold         =           contrast["results_parameters"]["lfcThreshold"]
		altHypothesis        =           contrast["results_parameters"]["altHypothesis"]
		independentFiltering = "TRUE" if contrast["results_parameters"]["independentFiltering"] else "FALSE"
		lfc_shrink_type      =           contrast["lfcShrink_parameters"]["type"]
		rank_by              =           contrast["ranking_by"]
		rank_order           =           contrast["ranking_order"].replace("x", "res$"+rank_by)
		
		script = textwrap.dedent(r"""
		#----- import packages
		library(AnnotationDbi)
		library(DESeq2)
		library({params.annot_pkg})
		
		{R_SESSION_INFO}

    
		#----- load DESeqDataSet
		dds <- readRDS("{input}")
		
		#----- contrast
		if ("{contrast_type}" == "ratio") {{
			dds${contrast_column} = relevel(dds${contrast_column}, ref="{contrast_ref}")
			dds <- nbinomWaldTest(dds)
		}}
		res <- results(dds, {contrast_def}, # alpha={cutoff_FDR}, lfcThreshold={lfcThreshold}, 
		               altHypothesis="{altHypothesis}") #, independentFiltering={independentFiltering})

		#----- log-fold-change shrinkage
		#res$lfcShrunk   <- res$log2FoldChange
		#res$lfcShrunkSE <- res$lfcSE
		if ("{lfc_shrink_type}"!="none"){{
			lfc <- lfcShrink(dds, {lfc_def}, # lfcThreshold={lfcThreshold}, 
				type="{lfc_shrink_type}")
			res$log2FoldChange_orig <- res$log2FoldChange
			res$log2FoldChange      <- lfc$log2FoldChange
			res$lfcSE_orig          <- res$lfcSE
			res$lfcSE               <- lfc$lfcSE
		}}
		
		#----- ranking
		res <- res[order({rank_order}),]
		
		#----- annotation [BEWARE- the in case on multiple mapping, the first entry will be selected]
		res$symbol <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(res)), 
		                     column="SYMBOL", keytype="ENSEMBL", multiVals="first")
		res$entrez <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", row.names(res)), 
		                     column="ENTREZID", keytype="ENSEMBL", multiVals="first")
		
		#----- save as rds & csv
		saveRDS(res, file="{output.rds}")
		write.table(data.frame(gene_id=rownames(res), res), file="{output.csv}", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
		""")
		
		script_file = pph.log(log.out, snakemake_format(script), step="contrasts_full", extension="R", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")


## ------------------------ preparation of the tmod databases ---------------------------

rule tmod_dbs:
	""" Preparing tmod databases: build the gene set databases and their gene ID mapping once for all contrasts """
	input:
		#rds = pph.file_path("contrasts_full", "rds"),
		annotation = pph.file_path(step="annotation", extension="rds", contrast="all")
	output:
		res = pph.file_path(step = "tmod_dbs", extension = "rds", contrast="all"),
		map = pph.file_path(step = "tmod_dbs", extension = "mapping.rds", contrast="all")
	log:
		out = pph.file_path(step = "tmod_dbs", extension = "output.log", log=True, contrast="all")
	run:
		# YAML file that the R script below loads as `config`.
		# This must be a plain string: a trailing comma after the call would turn it into a 1-tuple.
		config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
		# R template; placeholders in single braces are filled in by snakemake_format() below
		script = textwrap.dedent(r"""
			{R_SESSION_INFO}

			res.file <- "{output.res}"
			map.file <- "{output.map}"
			script.d <- "{SCRIPTDIR}"
			conf.f   <- "{config_file}"
			annot.f  <- "{input.annotation}"
			 
			## ------------------------------
			## no wildcards beyond this point
			## ------------------------------

			config <- yaml::yaml.load_file(conf.f)
			source(file.path(script.d, "tmod_functions.R"))

			## read the necessary files
			## create the databases
			dbs <- process_dbs(config)
			saveRDS(dbs, file=res.file)

			## provide mapping for the databases
			map <- get_mapping(config, dbs, annot.f)
			saveRDS(map, file=map.file)
		""")

		# pph.log() returns the path of the script file that is executed with Rscript;
		# stdout and stderr of the R run are appended to the rule log
		script_file = pph.log(log.out, snakemake_format(script), step="tmod_dbs", extension="R", contrast="all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")


## ------------------------ gene set enrichments with tmod ---------------------------

rule tmod:
	""" Gene set enrichment with tmod and MSigDB """
	input:
		rds = pph.file_path("contrasts_full", "rds"),
		dbs = pph.file_path(step = "tmod_dbs", extension = "rds", contrast="all"),
		map = pph.file_path(step = "tmod_dbs", extension = "mapping.rds", contrast="all")
	output:
		tmod_res  = pph.file_path(step = "tmod", extension = "rds"),
		tmod_gl   = pph.file_path(step = "tmod", extension = "gl.rds"),
		tmod_xlsx = pph.file_path(step = "tmod", extension = "xlsx")
	log:
		out = pph.file_path(step = "tmod", extension = "output.log", log=True)
	run:
		# YAML file that the R script below loads as `config`
		config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
		# R template; placeholders in single braces are filled in by snakemake_format() below,
		# doubled braces are literal braces of the R code.
		# Outputs written by the script: the ordered gene lists (tmod_gl), the full tmod results
		# (tmod_res) and a spreadsheet version of the results (tmod_xlsx).
		# NOTE(review): in the script, the `res.short` produced by the reformat_res() step is
		# overwritten by the following imap() call, which iterates over `res` and not over
		# `res.short` -- confirm whether imap() was meant to run on the reformatted results.
		script = textwrap.dedent(r"""
			{R_SESSION_INFO}

			script.d <- "{SCRIPTDIR}"
			conf.f   <- "{config_file}"
			de.input <- "{input.rds}"
			map.f    <- "{input.map}"
			dbs.f    <- "{input.dbs}"
			res.file <- "{output.tmod_res}"
			res.xlsx <- "{output.tmod_xlsx}"
			res.gl.f <- "{output.tmod_gl}"
			 
			## ------------------------------
			## no wildcards beyond this point
			## ------------------------------
			library(DESeq2)
			library(tmod)
			library(writexl)
      library(purrr)
			
			config <- yaml::yaml.load_file(conf.f)
			source(file.path(script.d, "tmod_functions.R"))

			dbs <- readRDS(dbs.f)
			db.map <- readRDS(map.f)
			de.res <- readRDS(de.input)

			## create ordered gene lists for every database
			message(paste0("Starting tmod for contrast ", de.input))
			ordered_genelist <- get_ordered_genelist(de.res, config)
			genelists <- lapply(dbs, map_genelists, ordered_genelist, db.map)
			saveRDS(genelists, file=res.gl.f)
			
			## run tmod using each of the predefined databases
			res <- lapply(dbs, run_tmod, config, genelists, db.map)

			## res is a list with an element for each db
			## each element for a db is a list with an element for each
			## of the sorting keys
			## each of these elements contains objects `res` (df with results)
			## and `gl` (genelist used to generate the results) 
			saveRDS(res, file=res.file)
			message("tmod rds saved")

			## shorten results for the human readable XLSX file

			## filter the results
			res.short <- lapply(res, function(x) lapply(x, reformat_res, pval.thr=Inf, auc.thr=0))

			## hard: add lists of significant genes to the results tables
			res.short <- imap(res, ~ {{
			  db.name <- .y
			  map(.x, ~ {{
			    add_sign_genes(., dbs[[db.name]], de.res, db.map)
			  }})
			}})

			## res.short still has two levels, one for the databases, another one
			## for sorting keys, so we need to unlist – one level only
			res.short <- unlist(res.short, recursive=FALSE)

			## remove if there were no results
			res.short <- res.short[!sapply(res.short, is.null)]
			write_xlsx(res.short, path=res.xlsx)
			""")
		# pph.log() returns the path of the script file that is executed below
		script_file = pph.log(log.out, snakemake_format(script), step="tmod", extension="R", **wildcards)

		# run R
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

##-------------------- multivariate tmod ----------------------------------------------------------------

rule tmod_pca:
	""" Gene set enrichment of PCA with tmod and MSigDB """
	input:
		dds = pph.file_path("DESeq2", "deseq2.rds", contrast = "all"),
		dbs = pph.file_path(step = "tmod_dbs", extension = "rds", contrast="all"),
		map = pph.file_path(step = "tmod_dbs", extension = "mapping.rds", contrast="all")
	output:
		res    = pph.file_path(step = "tmod_pca", extension = "rds", contrast="all"),
		pca    = pph.file_path(step = "tmod_pca", extension = "pca.rds", contrast="all"),
		xlsx   = pph.file_path(step = "tmod_pca", extension = "xlsx", contrast="all")
	log:
		out = pph.file_path(step = "tmod_pca", extension = "output.log", log=True, contrast="all")
	run:
		# YAML file that the R script below loads as `config`
		config_file = pph.file_path(step="pipeline_report", extension="yaml", contrast="all")
		# R template; placeholders in single braces are filled in by snakemake_format() below,
		# doubled braces are literal braces of the R code.
		# The script runs a PCA (prcomp) on the rlog-transformed counts after dropping
		# zero-variance genes, then applies tmodCERNOtest to the genes ordered by their loadings
		# (ascending, descending and by absolute value) for up to 4 components and each database.
		script = textwrap.dedent(r"""
			{R_SESSION_INFO}

			script.d  <- "{SCRIPTDIR}"
			conf.f    <- "{config_file}"
			dds.f     <- "{input.dds}"
			map.f     <- "{input.map}"
			dbs.f     <- "{input.dbs}"
			res.f     <- "{output.res}"
			res.xlsx  <- "{output.xlsx}"
			res.pca.f <- "{output.pca}"
			 
			## ------------------------------
			## no wildcards beyond this point
			## ------------------------------
			require(DESeq2)
			require(tmod)
			require(tidyverse)
			require(writexl)

			config <- yaml::yaml.load_file(conf.f)
			source(file.path(script.d, "tmod_functions.R"))

			dbs <- readRDS(dbs.f)
			db.map <- readRDS(map.f)
			dds <- readRDS(dds.f)
			rl <- assay(rlog(dds))

			nullvar <- apply(rl, 1, var) < .Machine$double.eps
			rl <- rl[ !nullvar, ]

			genes <- rownames(rl)
			if(is.null(genes)) stop("can't get primary IDs from DESeq2 object!")

			pca <- prcomp(t(rl), scale.=TRUE)
			saveRDS(pca, file=res.pca.f)

			ncomp <- min(4, ncol(pca$x))

			res <- lapply(dbs, function(db) {{
				mapping.id <- db.map$dbs[[ db$name ]]
				mapping <- db.map$maps[[ mapping.id ]]
				g.id <- setNames(mapping[genes], genes)
				ret <- lapply(1:ncomp, function(i) {{
					glists <- list(
						up=g.id[order(pca$rotation[,i])],
						down=g.id[order(-pca$rotation[,i])],
						abs=g.id[order(-abs(pca$rotation[,i]))])
					lapply(glists, tmodCERNOtest, mset=db$dbobj)
				}})
			  names(ret) <- paste0("PC.", 1:ncomp)
				ret
			}})

			saveRDS(res, file=res.f)

			## shorten results for the human readable XLSX file

			## filter the results
			res.short <- lapply(res, function(x) lapply(x, function(y) lapply(y, reformat_res, pval.thr=.05, auc.thr=.55)))

			## res.short still has two levels, one for the databases, another one
			## for sorting keys, so we need to unlist – one level only
			res.short <- unlist(unlist(res.short, recursive=FALSE), recursive=FALSE)
			write_xlsx(res.short, path=res.xlsx)
		""")

		# pph.log() returns the path of the script file that is executed below
		script_file = pph.log(log.out, snakemake_format(script), step="tmod_pca", contrast="all", extension="R", **wildcards)

		# run R
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")


##-------------------- functional annotation ----------------------------------------------------------------

rule goseq:
	""" over-representation analysis (ORA) with goseq producing GO and KEGG lists """
	input:
		txdb = pph.file_path("TxDb_from_GTF", "sqlite", contrast = "all"),
		res  = pph.file_path("contrast", "rds")
	output:
		go   = pph.file_path(step = "goseq", extension = "go.rds"),
		kegg = pph.file_path(step = "goseq", extension = "kegg.rds")
	log:
		out           = pph.file_path(step = "goseq", extension = "output.log",    log=True),
		contrast_yaml = pph.file_path(step = "goseq", extension = "contrast.yaml", log=True)
	params:
		annot_pkg = config["organism"]["R"]["annotations"],
		fit_pwf   = pph.file_path(step = "goseq", extension = "pwf_fit.pdf")
	run:
		# settings of the contrast this job is run for; a copy is dumped next to the log
		contrast = pph.get_contrast(wildcards.contrast)
		with open(log.contrast_yaml, "w") as contrast_dump:
			yaml.dump(contrast, contrast_dump, default_flow_style=False)

		# adjusted p-value cutoff that defines the DE gene set in the R script
		fdr_threshold = contrast["ORA"]["fdr_threshold"]
		# annotation package name without its last dot-separated part;
		# used as prefix of the mapping objects (ENSEMBL2EG, GO2ALLEGS, PATH) in the R script
		AnnotationDbi_prefix = params.annot_pkg.rpartition(".")[0]

		# R template; placeholders in single braces are filled in by snakemake_format() below
		script = textwrap.dedent(r"""
		#----- import packages
		library(goseq)
		library(GenomicFeatures)
		library(AnnotationDbi)
		library({params.annot_pkg})
		
		{R_SESSION_INFO}
		
		#----- load results
		res <- readRDS("{input.res}")
		
		#----- vector for goseq
		assayed.genes <- rownames(res)
		de.genes <- assayed.genes[which(res$padj < {fdr_threshold})]
		gene.vector <- as.integer(assayed.genes %in% de.genes); names(gene.vector) <- assayed.genes
                names(gene.vector) <- sub("\\.[0-9]+", "", names(gene.vector))
		
		#----- get gene lengths
		TxDb <- loadDb(file = "{input.txdb}")
		lengthData <- median(width(transcriptsBy(TxDb, "gene")))[rownames(res)]; names(lengthData) <- rownames(res)
		
		#----- get GO mapping
		en2eg <- as.list({AnnotationDbi_prefix}ENSEMBL2EG); go2eg <- as.list({AnnotationDbi_prefix}GO2ALLEGS)
		eg2go <- split(rep(names(go2eg), lengths(go2eg)), unlist(go2eg)); eg2kegg <- as.list({AnnotationDbi_prefix}PATH)

		grepAnnot <- function(id,mapkeys) unique(unlist(mapkeys[id], use.names=FALSE))
		go <- lapply(en2eg, grepAnnot, eg2go); kegg <- lapply(en2eg, grepAnnot, eg2kegg)
		
		#----- run goseq
		pdf("{params.fit_pwf}")
		pwf <- nullp(gene.vector, bias.data=lengthData)
		dev.off()
		GO <- goseq(pwf, gene2cat=go)
		KEGG <- goseq(pwf, gene2cat=kegg)
		
		GO$over_represented_qvalue <- p.adjust(GO$over_represented_pvalue, method="BH")
		GO$under_represented_qvalue <- p.adjust(GO$under_represented_pvalue, method="BH")
		KEGG$over_represented_qvalue <- p.adjust(KEGG$over_represented_pvalue, method="BH")
		KEGG$under_represented_qvalue <- p.adjust(KEGG$under_represented_pvalue, method="BH")
		
		#----- save as rds
		saveRDS(GO, file="{output.go}"); saveRDS(KEGG, file="{output.kegg}")
		""")
		
		# pph.log() returns the path of the script file that is executed with Rscript
		script_file = pph.log(log.out, snakemake_format(script), step="goseq", extension="R", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")
	
	
rule cluster_profiler:
	""" gene set enrichment analysis (GSEA) or over-representation analysis (ORA) with cluster profiler for MSigDb, GO and/or KEGG"""
	input:
		res  = pph.file_path("contrast", "rds")
	output:
		rds = pph.file_path(step = "cluster_profiler", extension = "rds"),
		csv = directory(pph.file_path(step = "cluster_profiler", extension = "csv"))
	log:
		out           = pph.file_path(step = "cluster_profiler", extension = "output.log",    log=True),
		contrast_yaml = pph.file_path(step = "cluster_profiler", extension = "contrast.yaml", log=True)
	params:
		annot_pkg = config["organism"]["R"]["annotations"],
		genus = config["organism"]["genus"],
		csv_basename = "category_"
	run:
		# settings of the contrast this job is run for; a copy is dumped next to the log
		contrast = pph.get_contrast(wildcards.contrast)
		with open(log.contrast_yaml, "w") as f: yaml.dump(contrast, f, default_flow_style=False)
		
		# adjusted p-value cutoff that defines the ORA gene list in the R script
		fdr_threshold = contrast["ORA"]["fdr_threshold"]
		cp_config = contrast["cluster_profiler"]
		
		def r_string(value):
			""" wrap a string in double quotes, i.e. turn it into an R string literal """
			return '"' + value + '"'
		
		def r_char_vector(values):
			""" R character vector literal c("a", "b", ...) """
			return 'c({})'.format(", ".join(r_string(v) for v in values))
		
		def r_test_type(section):
			""" R string literal with the test type of a config section; "gsea" if not set """
			return r_string(section["type"]) if "type" in section else '"gsea"'
		
		def r_cutoff_arg(section, key, r_arg_name):
			""" extra R function argument (with leading comma) if `key` is set in the section, else empty """
			return ", {} = {}".format(r_arg_name, section[key]) if key in section else ""
		
		# optional cutoff arguments inserted verbatim into the R function calls; empty = R defaults
		go_pval = go_qval = kegg_pval = kegg_qval = keggm_pval = keggm_qval = ""
		
		# per test: R code snippets for the categories to run, the test type and (KEGG) the organism
		test_annotations = {}
		if "MSigDb" in cp_config:
			msigdb = cp_config["MSigDb"]
			if "categories" in msigdb:
				test_annotations["MSigDb"] = dict(categories=r_char_vector(msigdb["categories"]))
			else:
				test_annotations["MSigDb"] = dict(categories='c("H","C1","C2","C3","C4","C5","C6","C7")')
			test_annotations["MSigDb"]["type"] = r_test_type(msigdb)
		if "GO" in cp_config:
			go = cp_config["GO"]
			if "ontologies" in go:
				test_annotations["GO"] = dict(categories=r_char_vector(go["ontologies"]))
			else:
				test_annotations["GO"] = dict(categories='c("MF", "BP", "CC")')
			test_annotations["GO"]["type"] = r_test_type(go)
			go_pval = r_cutoff_arg(go, "pval", "pvalueCutoff")
			go_qval = r_cutoff_arg(go, "qval", "qvalueCutoff")
		if "KEGG" in cp_config:
			kegg = cp_config["KEGG"]
			test_annotations["KEGG"] = dict(categories='c("pathways")')
			test_annotations["KEGG"]["org"]  = r_string(kegg["kegg_organism_code"])
			test_annotations["KEGG"]["type"] = r_test_type(kegg)
			kegg_pval = r_cutoff_arg(kegg, "pval", "pvalueCutoff")
			kegg_qval = r_cutoff_arg(kegg, "qval", "qvalueCutoff")
		if "KEGG_modules" in cp_config:
			kegg_modules = cp_config["KEGG_modules"]
			# Organism code and test type are taken from the KEGG_modules section itself.
			# The KEGG section is only a fallback for keys missing there, so configs that set
			# these keys under KEGG only keep working, while KEGG_modules can also be used
			# without a KEGG section or with its own test type.
			kegg_fallback = cp_config["KEGG"] if "KEGG" in cp_config else {}
			org_section  = kegg_modules if "kegg_organism_code" in kegg_modules else kegg_fallback
			type_section = kegg_modules if "type" in kegg_modules else kegg_fallback
			test_annotations["KEGG_modules"] = dict(categories='c("modules")')
			test_annotations["KEGG_modules"]["org"]  = r_string(org_section["kegg_organism_code"])
			test_annotations["KEGG_modules"]["type"] = r_test_type(type_section)
			keggm_pval = r_cutoff_arg(kegg_modules, "pval", "pvalueCutoff")
			keggm_qval = r_cutoff_arg(kegg_modules, "qval", "qvalueCutoff")
		
		# nested R list literal: list(<test>=list(<key>=<R code>,...), ...)
		test_annot_R = "list({})".format(", ".join(
			"{}=list({})".format(test, ",".join("{}={}".format(key, r_code) for key, r_code in settings.items()))
			for test, settings in test_annotations.items()
		))
		
		# R template; placeholders in single braces are filled in by snakemake_format() below,
		# doubled braces are literal braces of the R code
		script = textwrap.dedent(r"""
		#----- import packages
		library(clusterProfiler)
		library(GenomicFeatures)
		library(AnnotationDbi)
		library({params.annot_pkg})
		library(msigdbr)
		library(dplyr)
		
		{R_SESSION_INFO}
		
		#----- load results
		res <- readRDS("{input.res}")
		
		#----- gene list GSEA
		use_entries <- !is.na(res$entrez)
		geneList_gsea <- res$log2FoldChange[use_entries]
		names(geneList_gsea) <- as.character(res$entrez[use_entries])
		geneList_gsea <- sort(geneList_gsea, decreasing = TRUE)
		
		#----- gene list ORA
		geneList_ora <- res$entrez[res$padj < {fdr_threshold}]
		geneList_ora <- geneList_ora[!is.na(geneList_ora)]
		geneList_ora <- as.character(geneList_ora)
		
		#----- collect results
		test_annotations <- {test_annot_R}
		tests <- list()
		
		#----- run tests
    tests_to_run <- intersect(c("MSigDb", "GO", "KEGG", "KEGG_modules"), names(test_annotations))
		dir.create("{output.csv}", recursive=TRUE)
    tests <- lapply(tests_to_run, function(test) {{
			dir.create(paste0("{output.csv}/", test), recursive=TRUE)
			categories <- test_annotations[[test]]$categories
			#cat_tests <- vector("list", length(categories))

      cat_tests <- lapply(categories, function(cat) {{
				if (test == "MSigDb") msigdb_gs <- msigdbr(species = "{params.genus}", category = cat) %>% dplyr::select(gs_name, entrez_gene)
				if (test_annotations[[test]]$type == "gsea"){{
          em <- switch(test,
					  MSigDb         =GSEA(geneList_gsea, TERM2GENE = msigdb_gs),
					  GO             =gseGO(geneList = geneList_gsea, OrgDb = {params.annot_pkg}, ont = cat, pAdjustMethod = "BH"{go_pval}),
					  KEGG           =gseKEGG(geneList = geneList_gsea, organism = test_annotations$KEGG$org{kegg_pval}),
					  KEGG_modules   =gseMKEGG(geneList = geneList_gsea, organism = test_annotations$KEGG_modules$org{keggm_pval}))
				}} else {{
          em <- switch(test,
					  MSigDb         =enricher(geneList_ora, TERM2GENE = msigdb_gs),
					  GO             =enrichGO(gene = geneList_ora, universe = names(geneList_gsea), OrgDb = {params.annot_pkg}, 
					                           ont = cat, pAdjustMethod = "BH"{go_pval}{go_qval}),
					  KEGG           =enrichKEGG(gene = geneList_ora, organism = test_annotations$KEGG$org{kegg_pval}{kegg_qval}),
					  KEGG_modules   =enrichMKEGG(gene = geneList_ora, organism = test_annotations$KEGG_modules$org{keggm_pval}{keggm_qval}))
				}}
				write.table(data.frame(em), file=paste0("{output.csv}/",test,"/{params.csv_basename}",cat,".csv"), 
					    sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
        return(em)
			}})
      names(cat_tests) <- categories
      return(cat_tests)
		}})
    names(tests) <- tests_to_run
		
		#----- save as rds
		saveRDS(tests, file="{output.rds}")
		""")
		
		# pph.log() returns the path of the script file that is executed with Rscript
		script_file = pph.log(log.out, snakemake_format(script), step="cluster_profiler", extension="R", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

		
##-------------------- time series -----------------------------------------------------------------------------

rule rain:
	""" run Rain for rhythm detection in time series """
	input:
		counts = pph.file_path("import_gene_counts", "tsv", contrast="all")
	output:
		tsv = pph.file_path("rain", "tsv")
	log:
		out = pph.file_path(step = "rain", extension = "output.log", log=True)
	params:
		period = config["rain"]["period"],
		peak_boarder = config["rain"]["peak_boarder"],
		min_fold_change = config["rain"]["filter"]["min_fold_change"],
		min_threshold = config["rain"]["filter"]["frac_above_threshold"]["threshold"],
		frac_above_threshold = config["rain"]["filter"]["frac_above_threshold"]["frac"],
		annot_pkg  = config["organism"]["R"]["annotations"]
	run:
		# definition of the time series this job is run for (the contrast wildcard is its name)
		series = config["time_series"][wildcards.contrast]

		# count table columns of the series: the replicates of all time points, flattened in order
		replicate_columns = []
		for replicates in series["samples"]:
			replicate_columns.extend(replicates)
		r_ts_columns = pph.get_r_repr(replicate_columns, to_type = "vector")

		# time points and number of replicates per time point as R vectors
		r_time_points = pph.get_r_repr(series["time_points"], to_type = "vector")
		r_nr_replicates = pph.get_r_repr(list(map(len, series["samples"])), to_type = "vector")

		# R template; placeholders in single braces are filled in by snakemake_format() below
		script = textwrap.dedent(r"""
		library("stats")
		library("siggenes")
		library("rain")
		library("AnnotationDbi")
		library("{params.annot_pkg}")
		{R_SESSION_INFO}

		#----- params
		period <- {params.period}
		peak_boarder <- {params.peak_boarder}
		min_fold_change <- {params.min_fold_change}
		min_threshold <- {params.min_threshold}
		frac_above_threshold <- {params.frac_above_threshold}

		#----- prepare matrix
		counts_file <- data.frame(read.csv("{input.counts}", sep="\t"))
		ts_columns <- {r_ts_columns}
		counts_file <- t(counts_file[,ts_columns])

		#----- fraction above threshold filter
		frac <- apply(counts_file, 2, function(x) sum(x >= min_threshold)/length(x))
		counts_file <- counts_file[,frac >= frac_above_threshold]

		#----- fold change filter
		maxima <- apply(counts_file, 2, function(x) max(x, na.rm=T))
		minima <- apply(counts_file, 2, function(x) min(x, na.rm=T))
		counts_file <- counts_file[,maxima/minima > min_fold_change]

		#----- run rain
		distances <- diff({r_time_points})
		rainresult <- rain(counts_file,
		                   period=period,
		                   deltat=distances[1],
		                   peak.border=peak_boarder,
		                   verbose=F,
		                   measure.sequence={r_nr_replicates},
		                   na.rm=F
		)

		pvalues <- rainresult[T,1]
		qvalues_BH <- p.adjust(pvalues, 'BH')

		#----- map Ids
		symbol <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", colnames(counts_file)),
		                     column="SYMBOL", keytype="ENSEMBL", multiVals="first")
		entrez <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", colnames(counts_file)),
		                     column="ENTREZID", keytype="ENSEMBL", multiVals="first")

		#----- write tsv
		write.table(
			data.frame(
				ensembl=colnames(counts_file),
				symbol=symbol,
				entrez=entrez,
				q_val_BH=qvalues_BH,
				p_val=pvalues,
				phase=rainresult["phase"],
				peak_shape=rainresult["peak.shape"],
				period=rainresult["period"]
				),
			sep = "\t",
			quote = F,
			row.names = F,
			file = "{output.tsv}"
		)
		""")

		# pph.log() returns the path of the script file that is executed with Rscript
		script_file = pph.log(log.out, snakemake_format(script), step="rain", extension="R", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

rule dodr:
	""" compare two time series with the DODR R package """
	input:
		counts = pph.file_path("import_gene_counts", "tsv", contrast="all")
	output:
		tsv = pph.file_path("dodr", "tsv")
	log:
		out = pph.file_path(step = "dodr", extension = "output.log", log=True)
	params:
		time_series1 = lambda wcs: config["dodr"]["comparisons"][wcs.contrast]["time_series1"],
		time_series2 = lambda wcs: config["dodr"]["comparisons"][wcs.contrast]["time_series2"],
		period = config["dodr"]["period"],
		method = config["dodr"]["method"],
		annot_pkg  = config["organism"]["R"]["annotations"]
	run:
		def get_time_series_data(time_series):
			"""
			collect what the R script needs for one time series, which may come from a different experiment

			time_series: mapping with the keys "name" and "path"; an empty "path" means this pipeline run
			returns: (path of the count table, time points as R vector with one entry per replicate,
			          count table column names as R vector)
			"""
			time_series_path = time_series["path"] or pph.out_path_pattern
			series = pph.load_config_from_path(time_series_path)["time_series"][time_series["name"]]

			# repeat every time point once per replicate measured at that time point
			replicate_times = []
			for idx, time_point in enumerate(series["time_points"]):
				replicate_times.extend(time_point for _ in series["samples"][idx])
			r_time_points = pph.get_r_repr(replicate_times, to_type = "vector")

			# replicates of all time points, flattened in order
			replicate_columns = []
			for replicates in series["samples"]:
				replicate_columns.extend(replicates)
			r_ts_columns = pph.get_r_repr(replicate_columns, to_type = "vector")

			ts_counts = pph.file_path("import_gene_counts", "tsv", contrast="all", path_pattern=time_series_path)
			return (ts_counts, r_time_points, r_ts_columns)

		counts1, r_time_points1, r_ts_columns1 = get_time_series_data(params.time_series1)
		counts2, r_time_points2, r_ts_columns2 = get_time_series_data(params.time_series2)

		# R template; placeholders in single braces are filled in by snakemake_format() below
		script = textwrap.dedent(r"""
		library("DODR")
		library("AnnotationDbi")
		library("{params.annot_pkg}")
		{R_SESSION_INFO}

		#----- params
		period <- {params.period}
		method <- "{params.method}"
		times1 <- {r_time_points1}
		times2 <- {r_time_points2}

		#----- prepare main matrix
		counts1 <- data.frame(read.csv("{counts1}", sep="\t"))
		ts_columns1 <- {r_ts_columns1}
		counts1 <- t(counts1[,ts_columns1]); print(counts1[1:5,1:5]); print(times1)

		#----- prepare comparison matrix
		counts2 <- data.frame(read.csv("{counts2}", sep="\t"))
		ts_columns2 <- {r_ts_columns2}
		counts2 <- t(counts2[,ts_columns2]); print(counts2[1:5,1:5]); print(times2)

		#----- keep common columns
		common_col <- intersect(colnames(counts1), colnames(counts2))
		counts1 <- counts1[,common_col]
		counts2 <- counts2[,common_col]

		print(paste("times1 length:", length(times1), "counts1 dim:", paste(dim(counts1), collapse=", ")))
		print(paste("times2 length:", length(times2), "counts2 dim:", paste(dim(counts2), collapse=", ")))
		print(counts1[1:5,1:5])

		#----- run rain
		dodrresult <- dodr(val1=counts1, val2=counts2, times1=times1, times2=times2, period=period, method=method)

		#----- map Ids
		symbol <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", common_col),
		                     column="SYMBOL", keytype="ENSEMBL", multiVals="first")
		entrez <- mapIds({params.annot_pkg}, keys=sub("\\.[0-9]+$", "", common_col),
		                     column="ENTREZID", keytype="ENSEMBL", multiVals="first")

		#----- write tsv
		write.table(data.frame(ensembl=common_col, symbol=symbol, entrez=entrez, dodrresult$p.value.table),
		            sep = "\t", quote = F, row.names = F, file = "{output.tsv}")
		""")

		# pph.log() returns the path of the script file that is executed with Rscript
		script_file = pph.log(log.out, snakemake_format(script), step="dodr", extension="R", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")

##-------------------- report -------------------------------------------------------------------------------

rule report:
	""" write a static Rmd report """
	input:
		get_inputs_all()
	output:
		pph.file_path("report", "Rmd", contrast = "all")
	run:
		rt = ReportTool(pph)
		report_text = rt.generate_report()

		# R statements loading the file table and the config of every result set used in the report
		load_file_tables = []
		load_configs = []
		for tag, paths in rt.use_results.items():
			for num, path in enumerate(paths):
				file_table  = pph.file_path("report",          "tsv",  contrast="all", path_pattern=path)
				config_file = pph.file_path("pipeline_report", "yaml", contrast="all", path_pattern=path)
				pph.log_generated_files(save_to=file_table, path_pattern=path)
				id_suffix, _ = rt.get_id_suffix(tag, num)
				load_file_tables.append(f'file_tab{id_suffix} <- read.table("{file_table}", sep="\\t", header=TRUE, stringsAsFactors=FALSE)\n')
				load_configs.append(f'config{id_suffix} <- yaml.load_file("{config_file}")\n')

		# fill in the placeholders of the report text, one after the other
		substitutions = (
			("{{WORKING_DIRECTORY}}", os.getcwd() + os.sep),
			("{{R_COMMON}}",          str(rt.report_snippet_base_dir/"../R_common")),
			("{{LOAD_FILE_TABLE}}",   "".join(load_file_tables)),
			("{{LOAD_CONFIG_FILE}}",  "".join(load_configs)),
		)
		for placeholder, replacement in substitutions:
			report_text = report_text.replace(placeholder, replacement)

		with open(output[0], "w") as rmd_file:
			rmd_file.write(report_text)


## ----------------------- generate html report -------------------------

rule report_html:
	"""Generate html version of the report"""
	input:
		rmd=pph.file_path("report", "Rmd", contrast="all")
	output:
		html=pph.file_path("report_html", "html", contrast="all")
	log:
		out=pph.file_path(step = "report_html", extension = "output.log", log=True, contrast="all")
	run:
		# render the Rmd report with rmarkdown; stdout and stderr of R are appended to the rule log
		shell("Rscript --vanilla -e 'rmarkdown::render(\"{input.rmd}\", output_file=\"{output.html}\")' &>> '{log.out}'")

## ----------------------- contrast summary table -------------------------

rule contrast_summary:
	""" generate a summary table with contrast results """
	input:
		rds_list = pph.expand_path(step="contrast", extension="rds")
	output:
		xlsx = pph.file_path(step="contrast_summary", extension="xlsx", contrast="all"),
		tsv = pph.file_path(step="contrast_summary", extension="tsv", contrast="all")
	log:
		out = pph.file_path(step = "contrast_summary", extension = "output.log", contrast="all", log=True)
	run:
		# R representations of the contrast result files and of the contrast IDs;
		# in the R script the IDs are assigned as names of the file list
		# NOTE(review): this relies on both being in the same order -- confirm in the path handler
		rds_list = pph.get_r_repr(input.rds_list, to_type="list")
		contrast_ids = pph.get_r_repr(pph.contrast_ids)

		# R template; placeholders in single braces are filled in by snakemake_format() below,
		# doubled braces are literal braces of the R code.
		# The script joins the log2FoldChange and padj columns of all contrasts into one table,
		# keyed by gene ID and symbol.
		# NOTE(review): the final write.table() call does not pass row.names=FALSE, unlike the
		# other tables written in this file, so with R's default the TSV also contains a
		# row-name column -- confirm this is intended before changing it.
		script = textwrap.dedent(r"""
		#----- import packages
		library(DESeq2)
		library(writexl)
		library(data.table)

		{R_SESSION_INFO}

		#----- variables
		input_rds_list <- {rds_list}
		names(input_rds_list) <- {contrast_ids}
		output_xlsx <- "{output.xlsx}"
		output_tsv  <- "{output.tsv}"

		#----- merge contrast results in table
		merge_contrast_cols <- function(data_frame, rds_index){{
			print(paste("merge", rds_index))
			rds_file <- input_rds_list[[rds_index]]
			add_cols <- as.data.frame(readRDS(rds_file))[,c("log2FoldChange", "padj", "symbol")]
			colnames(add_cols) <- c(paste(c("LFC", "padj"), rds_index, sep="_"), "symbol")
			add_cols$gene <- rownames(add_cols)
			add_cols <- as.data.table(add_cols, key=c("gene", "symbol"))
			if (is.null(data_frame)) return(add_cols)
			merged_table <- merge(data_frame, add_cols, all=TRUE, by=c("gene", "symbol"))
			return(merged_table)
		}}
		indices <- as.list(names(input_rds_list))
		result_summary <- Reduce(merge_contrast_cols, append(indices, list(NULL), after=0))

		#----- save as XLSX and tsv
		write_xlsx(result_summary, path=output_xlsx)
		write.table(result_summary, file=output_tsv, sep="\t", quote=F)
		""")
		# pph.log() returns the path of the script file that is executed with Rscript
		script_file = pph.log(log.out, snakemake_format(script), step="contrast_summary", extension="R", contrast = "all", **wildcards)
		shell("Rscript --vanilla '{script_file}' &>> '{log.out}'")