-
Notifications
You must be signed in to change notification settings - Fork 1
/
functional_pipelines.R
210 lines (192 loc) · 7.94 KB
/
functional_pipelines.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
###########################################################################
#
# HEADER
#
###########################################################################
# Author: Matthew Muller
#
# Date: 2023-02-18
#
# Script Name: Functional Pipelines
#
# Notes:
# This script contains functions for running pipelines within an R interface. These functions mainly relate to rna-sequencing, but this is not exclusive.
###########################################################################
#
# LIBRARIES
#
###########################################################################
library(tidyverse)
library(ggplot2)
library(SummarizedExperiment)
library(DESeq2)
library(AnnotationDbi)
library(ggpubr)
library(EnhancedVolcano)
library(ggtree)
# LOAD FUNCTIONS
# space reserved for sourcing in functions
options(stringsAsFactors = FALSE)
source('https://raw.githubusercontent.com/mattmuller0/Rtools/main/general_functions.R')
source('https://raw.githubusercontent.com/mattmuller0/Rtools/main/stats_functions.R')
source('https://raw.githubusercontent.com/mattmuller0/Rtools/main/deseq_helper_functions.R')
source('https://raw.githubusercontent.com/mattmuller0/Rtools/main/geneset_creation_functions.R')
###########################################################################
#
# CODE
#
###########################################################################
##### Geneset Creation Pipeline
# Input:
# - outdir: output directory
# - dds: DESeq2 object
# - target: target variable
# - depthFilter: minimum number of samples a gene must be expressed in
# - sizeFilter: minimum number of genes a geneset must contain
# - percentFilter: minimum percent of genes in a geneset that must be expressed in a sample
# - test_names: names of tests to run for geneset creation
# - test_params: parameters for each test
create_geneset <- function(
# metadata
outdir,
# data
dds, target,
# rna filtering parameters
depthFilter=10, sizeFilter=1e6, percentFilter=0.25,
# geneset creation parameters
deseq_filter = 0.05, fc_filter = 0.5,
cor_filter = NA, cor_method = "spearman",
# predictive modeling parameters
normalization = "cpm",
# sys params
conda = "tf", kwargs = NULL
) {
# rna preprocessing
message("========= Preprocessing Data =========")
message("Filtering Data...")
dds <- rna_preprocessing(dds, file.path(outdir, "preprocessing"), depthFilter, sizeFilter, percentFilter)
# DESeq analysis
message("========= Running DESeq Analysis =========")
message(paste0("Pulling DESeq results for: ", target, "..."))
threshold <- "padj"
deseq_res_name <- c(target, "1", "0")
deseq_res <- results(dds, contrast = deseq_res_name) %>%
as.data.frame()
deseq_sign_genes <- deseq_res %>%
filter(padj < deseq_filter, abs(log2FoldChange) > fc_filter) %>%
rownames()
# If not enough genes identified by FDR correction, switch to pvalue threshold
if (length(deseq_sign_genes) < 75) {
message("Not enough genes identified by FDR correction. Switching to pvalue threshold.")
deseq_sign_genes <- deseq_res %>%
filter(pvalue < deseq_filter, abs(log2FoldChange) > fc_filter) %>%
rownames()
threshold <- "pvalue"
}
message(paste0("DESeq identified genes: n = ", length(deseq_sign_genes)))
# Correlation analysis
message("========= Running Correlation Analysis =========")
message(paste0("Pulling Correlation results for: ", target, "..."))
cor_res <- calculate_correlations(dds, target, method=cor_method)
cor_sign_genes <- cor_res %>%
filter(abs(correlation) > cor_filter) %>%
rownames()
message(paste0("Correlation identified genes: n = ", length(cor_sign_genes)))
# Create geneset
message("========= Creating Geneset =========")
message(paste0("Creating geneset for: ", target, "..."))
geneset_genes <- intersect(deseq_sign_genes, cor_sign_genes)
message(paste0("Geneset contains: ", length(geneset_genes), " genes"))
res <- deseq_res %>% dplyr::select(baseMean, log2FoldChange, pvalue, padj) %>%
mutate(
deseq_pval = pvalue, deseq_padj = padj,
correlation = cor_res$correlation, cor_pval = cor_res$pvalue, cor_padj = cor_res$padj,
pvalue = NULL, padj = NULL
)
# Plot metrics
message("========= Plotting Results =========")
message("Plotting geneset volcano...")
max_log2FC <- max(res$log2FoldChange)
min_log2FC <- min(res$log2FoldChange)
geneset_correlations <- EnhancedVolcano(
res, lab=rownames(res),
x = "log2FoldChange", y = threshold,
pCutoff = deseq_filter, FCcutoff = fc_filter
) +
labs(title = paste0("Geneset Volcano: ", target), x = "log2FoldChange", y = threshold)
ggsave(file.path(outdir, paste0("geneset_volcano_", target, ".png")), geneset_correlations)
message("Plotting geneset correlations...")
geneset_correlations <- EnhancedVolcano(
res, lab=rownames(res),
x = "log2FoldChange", y = "correlation",
pCutoff = cor_filter, FCcutoff = fc_filter
) +
lims(y = c(-1, 1)) +
labs(title = paste0("Geneset Correlations: ", target), x = "log2FoldChange", y = "correlation")
ggsave(file.path(outdir, paste0("geneset_correlations_", target, ".png")), geneset_correlations)
# export geneset
message("Exporting geneset...")
junk <- export_counts_and_labels_from_SE(dds, geneset_genes, target, outdir, normalize = normalization)
# feature elimination and predictive modeling
message("========= Running Predictive Modeling =========")
sys.call(paste0("conda activate ", conda))
sys.call("Python3", c(
"recursive_feature_elimination_and_model_generation.py",
"--input", file.path(outdir, paste0(target, "_counttable")),
"--labels", file.path(outdir, paste0(target, "_labels")),
"--output", file.path(outdir, "modeling"),
"--scoring", "roc_auc",
"--random_state", 420,
"--folds", 5,
"--verbose", TRUE
))
sys.call("conda deactivate")
# Load results
message("========= Loading Predictive Modeling Results =========")
message("Loading results...")
# Load in the final geneset
geneset_final <- read.csv(paste0(outdir, "/modeling/optimal_features.csv"))$X0
# Load in the test and train cohorts
test <- read.csv(paste0(outdir, "/modeling/data/X_test.csv"))
train <- read.csv(paste0(outdir, "/modeling/data/X_train.csv"))
# Stop early if geneset is too small
if (length(geneset_final) < 10) {
message("Geneset is too small. Stopping early.")
return()
}
IRkernel::installspec()
message("Plotting geneset heatmap...")
geneset_heatmap <- plot_geneset_heatmap(dds, geneset_final, target)
ggsave(file.path(outdir, paste0("geneset_heatmap_", target, ".png")), geneset_heatmap)
message("Plotting geneset PCA...")
geneset_pca <- plot_normalized_pca(dds[geneset_final,], target, normalization)
ggsave(file.path(outdir, paste0("geneset_pca_", target, ".png")), geneset_pca)
message("Plotting geneset boxplots...")
geneset_boxplots <- plot_gene_boxplot(dds, geneset_final, target, file.path(outdir, "geneset_boxplots"))
ggsave(file.path(outdir, paste0("geneset_boxplots_", target, ".png")), geneset_boxplots)
message("Plotting geneset singscores...")
up_genes <- hyper_res %>%
filter(log2FoldChange > 0) %>%
select(geneset_final) %>%
row.names()
down_genes <- hyper_res %>%
filter(log2FoldChange < 0) %>%
select(geneset_final) %>%
row.names()
geneset_singscores <- plot_singscores(dds[,rownames(test)], target, file.path(outdir, "geneset_singscores.png"), up_genes, down_genes)
message("Saving parameters...")
params <- data.frame(
target = target,
geneset = geneset,
depthFilter = depthFilter,
sizeFilter = sizeFilter,
percentFilter = percentFilter,
deseq_filter = deseq_filter,
fc_filter = fc_filter,
cor_filter = cor_filter,
cor_method = cor_method,
normalization = normalization,
conda_env = conda
)
write.csv(params, file.path(outdir, paste0(target, "_modeling_params.csv")), row.names = F)
}