-
Notifications
You must be signed in to change notification settings - Fork 1
/
rna_functions.R
683 lines (579 loc) · 22.9 KB
/
rna_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
###########################################################################
#
# rna_functions
#
###########################################################################
# Author: Matthew Muller
# Date: 2023-12-28
# Script Name: rna_functions
#======================== LIBRARIES ========================#
suppressMessages(library(AnnotationDbi))
suppressMessages(library(forcats))
suppressMessages(library(ggplot2))
suppressMessages(library(msigdbr))
suppressMessages(library(ggpubr))
suppressMessages(library(DESeq2))
suppressMessages(library(edgeR))
suppressMessages(library(limma))
suppressMessages(library(ggbiplot))
suppressMessages(library(tidyverse))
# LOAD FUNCTIONS
# space reserved for sourcing in functions
source("https://raw.githubusercontent.com/mattmuller0/Rtools/main/general_functions.R")
source("https://raw.githubusercontent.com/mattmuller0/Rtools/main/plotting_functions.R")
source("https://raw.githubusercontent.com/mattmuller0/Rtools/main/enrichment_functions.R")
source("https://raw.githubusercontent.com/mattmuller0/Rtools/main/converting_functions.R")
source("https://raw.githubusercontent.com/mattmuller0/Rtools/main/filtering_functions.R")
source("https://raw.githubusercontent.com/mattmuller0/Rtools/main/stats_functions.R")
#======================== Reading Functions ========================
#' Function to make a count table from feature counts output directory
#' Arguments:
#' directory: directory containing feature counts output
#' pattern: pattern to match for feature counts output
#' idx: columns to extract for (1) gene name and (2) counts
#' ...: additional arguments to pass to read.table
#' Outputs:
#' data frame of counts
ReadFeatureCounts <- function(f, idx = idx, ...) {
if (missing(f))
stop("f is missing")
if (!file.exists(f))
stop("file not found")
message("reading ", f, " ...")
a <- read.table(f, header = TRUE, ...)
a <- a %>% dplyr::select(1, idx)
return(a)
}
CountTableFromFeatureCounts <- function (directory = ".", pattern = "featureCounts.txt$", idx = 7, ... ) {
if (missing(directory))
stop("directory is missing")
fl <- list.files(directory, pattern = pattern, full.names = TRUE, recursive = TRUE)
message("reading ", length(fl), " samples ...")
sample_names <- basename(fl)
l <- purrr::map(fl, ReadFeatureCounts, idx = idx, ...)
tbl <- purrr::reduce(l, inner_join)
return(tbl)
}
#======================== Testing Functions ========================
#' Wilcoxan Ranked Sum testing on genes in two summarized experiments
#' Arguments:
#' dds: DESeq2 object
#' condition: column of interest
#' controls: control groups
#' pCutoff: pvalue cutoff for volcano plot
#' FCcutoff: fold change cutoff for volcano plot
#' ...: additional arguments to pass to wilcox.test
#' Outputs:
#' data frame of p-values and adjusted p-values
gene_wilcox_test <- function(
dds, outpath,
condition,
pCutoff = 0.05, FCcutoff = 0.5,
...) {
require(DESeq2)
# make the outpath
dir.create(outpath, showWarnings = FALSE, recursive = TRUE)
# Extract count data from the DESeq object
count_data_raw <- counts(dds, normalize = T) %>%
t() %>%
as.data.frame()
count_data <- log2(count_data_raw + 1)
counts_data_raw <- normalize_counts(dds, method = "log2-mor")
# Extract metadata from the DESeq object
meta <- colData(dds) %>%
as.data.frame() %>%
dplyr::select(all_of(condition))
# get the conditions
conditions <- unique(meta[, condition])
# Run the wilcoxon test for each gene using map
test_res <- map(
count_data,
function(x) {
# run the test
test <- wilcox.test(
x[meta[, condition] == conditions[1]],
x[meta[, condition] == conditions[2]],
...
)
# get the pvalue
pvalue <- test$p.value
# get the basemean
basemean <- 2^mean(x)
# get the log2FC
log2FoldChange <- mean(x[meta[, condition] == conditions[2]]) - mean(x[meta[, condition] == conditions[1]])
# return the test
return(list(basemean = basemean, log2FoldChange = log2FoldChange, pvalue = pvalue))
}
)
names(test_res) <- colnames(count_data)
# Extract p-values and adjust for multiple testing using the Benjamini-Hochberg method
res <- list_of_lists_to_df(test_res)
res <- res %>% mutate(padj = p.adjust(pvalue, method="BH"))
# wrote the results to a csv
write.csv(res, file.path(outpath, "wilcox_results.csv"))
# make a volcano plot
volcanoP <- EnhancedVolcano(
res, lab=rownames(res),
x = "log2FoldChange", y = "pvalue",
title = "Volcano Plot", subtitle = "",
pCutoff = pCutoff, FCcutoff = FCcutoff
)
ggsave(file.path(outpath, "volcanoPlot.pdf"), volcanoP)
# get the fc list
fc <- get_fc_list(res, "log2FoldChange")
# run enrichment
gse <- rna_enrichment(fc, outpath)
# View results
return(res)
}
#' Function to run limma voom on a summarized experiment object with TMM normalization
#' Arguments:
#' dds: DESeq2 object
#' condition: vector of conditions
#' controls: vector of control groups
#' ...: additional arguments to pass to voom
#' Outputs:
#' data frame of p-values and adjusted p-values
run_limma <- function(
se, outpath,
condition, controls = NULL,
pCutoff = 0.05, FCcutoff = 0.5,
...) {
# make the directory if it doesn"t exist
dir.create(outpath, showWarnings = FALSE, recursive = TRUE)
# Convert the counts matrix to a matrix if it"s not already
counts <- assay(se)
if (!is.matrix(counts)) {
counts <- as.matrix(counts)
}
condition <- se[[condition]]
if (!is.null(controls)) {
controls <- se[[controls]]
}
# Normalize the counts matrix using TMM
require(edgeR)
counts <- DGEList(counts = counts, group = condition)
counts <- calcNormFactors(counts, method = "TMM")
norm_counts <- cpm(counts, log = TRUE)
# Create the design matrix
if (is.null(controls)) {
design <- model.matrix(~ condition)
} else {
design <- model.matrix(~ condition + controls)
}
# Perform differential expression analysis
fit <- lmFit(norm_counts, design)
fit <- eBayes(fit)
# Get the differential expression results
results <- topTable(fit, coef = 2, number = Inf, ...)
# make a volcano plot
volcanoP <- plot_volcano(results, title = "limma", color = "adj.P.Val", pCutoff = pCutoff)
ggsave(file.path(outpath, "volcanoPlot.pdf"), volcanoP)
# save the results
write.csv(results, file.path(outpath, "limma_results.csv"))
# get the fc list
fc <- get_fc_list(results, "logFC")
# run enrichment
gse <- gsea_analysis(fc, outpath)
return(results)
}
#' Calculate Correlations
#' Arguments:
#' dds: DESeq2 object
#' condition: column of interest
#' method: correlation method to use
#' Outputs:
#' Correlation results for each gene
calculate_correlations <- function(dds, condition, normalize = "mor", method="spearman", ...) {
# Extract the condition vector
condition <- colData(dds)[, condition] %>% as.numeric()
# Extract the gene expression data
gene_expression <- normalize_counts(dds, method = normalize)
# Apply the cor() function to each row of the gene_expression matrix
gene_expression_correlations <- apply(gene_expression, 1, function(x) {
correlation <- cor.test(x, condition, method=method, ...)
return(list(correlation=correlation$estimate, pvalue=correlation$p.value))
})
# Convert the correlations to a data frame
gene_expression_correlations_df <- data.frame(row.names = rownames(dds),
correlation=sapply(gene_expression_correlations, function(x) x$correlation),
pvalue=sapply(gene_expression_correlations, function(x) x$pvalue)
)
# Return the data frame of gene expression correlations
return(gene_expression_correlations_df)
}
#' Run Simple Deseq analysis
#' Arguments:
#' dds: DESeq2 object
#' outpath: path to push results to
#' contrast: contrast to run
#' ...: additional arguments to pass to DESeq2
#' Outputs:
#' results of differential expression analysis
run_deseq <- function(
# main inputs
dds, outpath,
# additional arguments for DESeq2
contrast = NA, ...,
# additional arguments for plotting
pvalue = "padj", pCutoff = 0.05, FCcutoff = 0
) {
require(DESeq2)
require(SummarizedExperiment)
require(tidyverse)
require(ggplot2)
# This function will run differential expression on a premade dds object.
# Gives most metrics you"ll need, and also returns the results
dir.create(outpath, showWarnings = F, recursive = T)
message("Running DESeq2")
dds <- DESeq(dds, ...)
# Get results names
res <- results(dds)
if (is.vector(contrast)) {
name <- paste0(contrast[1], "__", contrast[2], "_vs_", contrast[3])
res <- results(dds, contrast=contrast)
resLFC <- lfcShrink(dds, coef=length(resultsNames(dds)), type="apeglm")
pdf(file.path(outpath, "MAplot.pdf"))
DESeq2::plotMA(resLFC)
dev.off()
# make volcano plot
volcanoP <- plot_volcano(res, title = name, color = pvalue, pCutoff = pCutoff)
ggsave(file.path(outpath, "volcanoPlot.pdf"), volcanoP)
# make a heatmap of the significant genes
tryCatch({
sign_genes <- res[, pvalue] < pCutoff & abs(res[, "log2FoldChange"]) > FCcutoff
is.na(sign_genes) <- FALSE # some error handing for outliers as NA
heatmapP <- plot_gene_heatmap(dds[sign_genes, ], title = name, annotations = contrast[1], normalize = "vst", show_row_names = FALSE, show_column_names = FALSE)
pdf(file.path(outpath, "dge_heatmap.pdf"))
print(heatmapP)
dev.off()
}, error = function(e) {
message("An error occurred while generating the heatmap: ", conditionMessage(e))
})
# make gene list
geneList <- get_fc_list(res)
gse_list <- gsea_analysis(geneList, outpath)
message("Results Summary:")
summary(res)
message("Writing results to outpath")
write.csv(res, file.path(outpath, "deseq_results.csv"))
saveRDS(dds, file = file.path(outpath,"dds.rds"))
return(res)
}
name <- resultsNames(dds)[length(resultsNames(dds))]
res <- results(dds, contrast=contrast)
resLFC <- lfcShrink(dds, coef=name, type="apeglm")
pdf(file.path(outpath, "MAplot.pdf"))
DESeq2::plotMA(resLFC)
dev.off()
# make volcano plot
volcanoP <- plot_volcano(res, title = name, color = pvalue, pCutoff = pCutoff)
ggsave(file.path(outpath, "volcanoPlot.pdf"), volcanoP)
# make a heatmap
sign_genes <- rownames(res)[res$pvalue < pCutoff & abs(res$log2FoldChange) > FCcutoff]
if (length(sign_genes) == 0) {
message("No significant genes found")
} else {
heatmapP <- plot_gene_heatmap(dds[sign_genes, ], title = name, annotations = contrast[1], normalize = "vst")
pdf(file.path(outpath, "dge_heatmap.pdf"))
print(heatmapP)
dev.off()
}
# make gene list
geneList <- get_fc_list(res)
gse_list <- gsea_analysis(geneList, outpath)
message("Results Summary:")
summary(res)
message("Writing results to outpath")
write.csv(res, file.path(outpath, "deseq_results.csv"))
saveRDS(dds, file = file.path(outpath,"dds.rds"))
return(res)
}
#' OVR Deseq Results Function
#' Arguments:
#' dds: DESeq2 object
#' column: column of interest for OVR analysis
#' outpath: path to push results to
#' Outputs:
#' OVR results for each level of column of interest
ovr_deseq_results <- function(dds, column, outpath, controls = NULL, ...) {
require(DESeq2)
require(clusterProfiler)
require(DOSE)
dir.create(outpath, showWarnings = F, recursive = T)
# This function takes in a DDS or SE object, the condition column of interest
# and the out directory path to push results to. It will run OVR differential
# expression analysis on each level within the condition column.
counts <- assay(dds)
lvls <- levels(colData(dds)[,column])
cond <- as.character(colData(dds)[,column])
# loop over condition levels in a one versus rest manner
# ideally this for loop could be an apply statement with a custom function,
# but it works for now so oh well lol
list_out <- list()
for (lvl in lvls) {
print(paste0("Testing ", column, " ", lvl, " versus rest"))
path <- file.path(outpath, paste0(column, "__",lvl,"_v_rest"))
dir.create(path, showWarnings = F, recursive = T)
# Set our OVR analysis
cond_ <- cond
cond_[cond_ != lvl] <- "rest"
dds$condition <- factor(cond_, levels = c("rest", lvl))
input_ <- ifelse(is.null(controls), "condition", paste0(append(controls, "condition"), collapse = " + "))
fmla <- as.formula(paste0("~ ", input_))
design(dds) <- fmlaconda
res <- run_deseq(dds, path, contrast = c("condition", lvl, "rest"))
# saveRDS(dds, file=paste0(path, "/deseqDataset_", column,"__",lvl,"_v_rest.rds"))
# write.csv(res, file=paste0(path, "/dge_results_", column,"__",lvl,"_v_rest.csv"))
# volcanoP <- plot_volcano(res, title = paste0(column,"__",lvl,"_v_rest"))
# ggsave(paste0(path, "/volcanoPlot_", column,"__",lvl,"_v_rest.pdf"), volcanoP)
# make a heatmap of the significant genes
# tryCatch({
# sign_genes <- res[, pvalue] < pCutoff & abs(res[, "log2FoldChange"]) > FCcutoff
# heatmapP <- plot_gene_heatmap(dds[sign_genes, ], title = name, annotations = c(condiition, controls), normalize = "vst")
# pdf(file.path(outpath, "dge_heatmap.pdf"))
# print(heatmapP)
# dev.off()
# }, error = function(e) {
# message("An error occurred while generating the heatmap: ", conditionMessage(e))
# })
# geneList <- get_fc_list(res)
# gse <- gsea_analysis(geneList, path)
list_out[[lvl]] <- res
}
return(list_out)
}
#' Function to run deseq on a variety of conditions
#' Arguments:
#' dds: DESeq2 object
#' conditions: list of conditions to run deseq on
#' controls: list of controls to run deseq on
#' outpath: path to push results to
#' ...: additional arguments to pass to deseq for volcano plots
#' Outputs:
#' results of differential expression analysis
deseq_analysis <- function(dds, conditions, controls = NULL, outpath, ...) {
# make directory
dir.create(outpath, showWarnings = FALSE, recursive = TRUE)
# make list to store results
analysis_list <- list()
summary_df <- data.frame()
# make design matrix
for (condition in conditions) {
# make directory for condition
dir.create(file.path(outpath, condition), showWarnings = FALSE, recursive = TRUE)
# make new dds object with no NAs
if (!is.null(controls)) {
dds_ <- remove_na_variables(dds, c(controls, condition))
# make a stats table of the conditions and controls
df_stats <- as.data.frame(colData(dds_))
stats_table <- stats_table(df_stats, condition, vars = controls)
# save the stats table
write.csv(stats_table, file.path(outpath, condition, paste0(condition, "_stats_table.csv")))
} else {
dds_ <- remove_na_variables(dds, condition)
}
# check if condition is a factor
if (!is.factor(colData(dds_)[, condition])) {
colData(dds_)[, condition] <- as.factor(colData(dds_)[, condition])
print(paste0("Converting ", condition, " to factor"))
print(paste0("Levels: ", paste0(levels(colData(dds_)[, condition]), collapse = ", ")))
}
input_ <- ifelse(is.null(controls), condition, paste0(append(controls, condition), collapse = " + "))
design_matr <- as.formula(paste0("~ ", input_))
dds_ <- DESeqDataSet(dds_, design = design_matr)
levels <- levels(colData(dds_)[, condition])
logg <- file(file.path(outpath, condition, paste0(condition, "_analysis_summary.log")), open = "wt")
sink(logg)
sink(logg, type = "message")
message(paste0("Running Analysis on ", condition))
# make a pca plot
tryCatch({
pcs <- prcomp(scale(t(normalize_counts(dds_, method = "vst"))))
pca_plot <- ggbiplot(pcs, groups = colData(dds_)[, condition], ellipse = TRUE, var.axes = FALSE)
ggsave(file.path(outpath, condition, "pca_plot.pdf"), pca_plot)
}, error = function(e) {
message("An error occurred while generating the PCA plot: ", conditionMessage(e))
})
# if there are more than 2 levels, run OVR analysis
if (length(levels) > 2) {
res <- ovr_deseq_results(dds_, condition, file.path(outpath, condition), ...)
# # summarize res
summary <- lapply(res, summarize_deseq_experiment)
# make condition for the res
# condition <- names(res)
for (i in 1:length(summary)) {
summary[[i]]$condition <- names(res)[i]
}
# summary <- lapply(summary, function(x) {x$condition <- condition; return(x)})
summary <- do.call(rbind, summary)
# reorder columns so condition is first
summary <- summary[, c(ncol(summary), 1:(ncol(summary) - 1))]
# add summary to dataframe
summary_df <- rbind(summary_df, summary)
# add results to list
analysis_list[[condition]] <- res
}
# if there are only 2 levels, run normal deseq
if (length(levels) == 2) {
# flip levels if they are 0 and 1
# if(setequal(levels, c("0", "1"))) {levels <- c("1", "0")}
# get contrast and run deseq
contrast <- c(condition, levels[2], levels[1])
res <- run_deseq(dds_, file.path(outpath, condition), contrast = contrast, ...)
# summarize res
summary <- summarize_deseq_experiment(res, padj_cutoffs = c(0.05, 0.1, 0.2), pvalue_cutoffs = c(0.01, 0.05, 0.1))
summary$condition <- condition
# reorder columns so condition is first
summary <- summary[, c(ncol(summary), 1:(ncol(summary) - 1))]
# add summary to dataframe
summary_df <- rbind(summary_df, summary)
# add results to list
analysis_list[[condition]] <- res
}
# if there is only one level, skip
if (length(levels) == 1) {
message(paste0("Skipping ", condition, " because there is only one level"))
}
# if there are no levels, skip
if (length(levels) == 0) {
message(paste0("Skipping ", condition, " because there are no levels"))
}
sink(type = "message")
sink()
close(logg)
readLines(file.path(outpath, condition, paste0(condition, "_analysis_summary.log")))
}
# save summary dataframe
write.csv(summary_df, file.path(outpath, "deseq_analysis_summary.csv"), row.names = FALSE)
return(analysis_list)
}
#' Function to run statistical tests on a DESeq2 object using rstatix functions
#' Arguments:
#' formula: formula to use for the test
#' dds: DESeq2 object
#' rstatix_test: rstatix test to use
#' ...: additional arguments to pass to rstatix_test
#' Outputs:
#' data frame of p-values and adjusted p-values
test_dds <- function(formula, dds, rstatix_test = wilcox_test, ...) {
require(DESeq2)
message("Extracting data from DESeq object")
# Extract count data from the DESeq object
count_data_raw <- assay(dds) %>%
t() %>%
as.data.frame()
count_data <- log2(count_data_raw + 1)
message("Extracting metadata from DESeq object")
# Extract metadata from the DESeq object
meta <- colData(dds) %>%
as.data.frame() %>%
dplyr::select(all_of(attr(terms(formula), "term.labels")))
message("Running wilcoxon test")
# run the test for all the genes using map
test_res <- map(
count_data,
function(x) {
# combine the metadata and count data on the gene
data <- meta %>% mutate(x = x)
# add the gene to the formula as the response using reformulate
formula <- reformulate(
response = "x",
termlabels = attr(terms(formula), "term.labels")
)
# get the condition as the last term in the formula
condition <- attr(terms(formula), "term.labels")[length(attr(terms(formula), "term.labels"))]
conditions <- unique(meta[, condition])
# run the test
test <- rstatix_test(
data,
formula,
...
)
# get the stats
basemean <- 2^mean(x)
log2FC <- mean(x[meta[, condition] == conditions[2]]) - mean(x[meta[, condition] == conditions[1]])
test$basemean <- basemean
test$log2FC <- log2FC
# return the test
return(test)
}
)
# add the gene names
names(test_res) <- colnames(count_data)
# convert to dataframe
df_out <- list_of_lists_to_df(test_res) %>%
mutate(
pvalue = p,
padj = p.adjust(p, method = "fdr")
) %>%
dplyr::select(
basemean,
log2FC,
pvalue,
padj
)
# return the dataframe
return(df_out)
}
#======================== Summary Functions ========================
#' Function to summarize deseq results and return summary of results
#' Arguments:
#' results: results of differential expression analysis
#' padj_cutoffs: list of padj cutoffs to use
#' pvalue_cutoffs: list of pvalue cutoffs to use
#' Outputs:
#' summary of results
summarize_deseq_experiment <- function(
results,
padj_cutoffs = c(0.05, 0.1, 0.2), pvalue_cutoffs = c(0.01, 0.05, 0.1),
logFC_cutoff = 0
) {
# summary of the results at different padj cutoffs
padj_summary <- data.frame()
for (padj_cutoff in padj_cutoffs) {
padj_summary <- rbind(
padj_summary, data.frame(sign_cutoff = paste0("padj < ", padj_cutoff),
n_genes = filter(as.data.frame(results), padj < padj_cutoff) %>% nrow(),
n_up = filter(as.data.frame(results), padj < padj_cutoff & log2FoldChange > logFC_cutoff) %>% nrow(),
n_down = filter(as.data.frame(results), padj < padj_cutoff & log2FoldChange < logFC_cutoff) %>% nrow())
)
}
# summary of the results at different pvalue cutoffs
pvalue_summary <- data.frame()
for (pvalue_cutoff in pvalue_cutoffs) {
pvalue_summary <- rbind(
pvalue_summary,
data.frame(sign_cutoff = paste0("pvalue < ", pvalue_cutoff), n_genes = filter(as.data.frame(results), pvalue < pvalue_cutoff) %>% nrow(),
n_up = filter(as.data.frame(results), pvalue < pvalue_cutoff & log2FoldChange > logFC_cutoff) %>% nrow(),
n_down = filter(as.data.frame(results), pvalue < pvalue_cutoff & log2FoldChange < -logFC_cutoff) %>% nrow())
)
}
# combine the summaries
summary <- rbind(padj_summary, pvalue_summary)
return(summary)
}
#' Function to compare different results
#' Arguments:
#' results 1: results of differential expression analysis
#' results 2: results of differential expression analysis
#' metric: metric to compare (default is log2FoldChange)
#' by: column to join by (default is NULL)
#' Outputs:
#' data frame of comparison results
compare_results <- function(res1, res2, metric = "log2FoldChange", by = "rowname", suffix = c("_1", "_2")) {
res1 <- as.data.frame(res1)
res2 <- as.data.frame(res2)
# ensure the metric and labels are in the results
if (!metric %in% c(colnames(res1), "rowname")) {
stop("metric not in results 1")
}
out <- inner_join(
res1 %>% rownames_to_column("rowname"),
res2 %>% rownames_to_column("rowname"),
by = by,
suffix = suffix
)
# return the data frame
return(out)
}