-
Notifications
You must be signed in to change notification settings - Fork 1
/
general_functions.R
332 lines (304 loc) · 10.9 KB
/
general_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# HEADER --------------------------------------------------
# Author: Matthew Muller
#
# Date: 2023-01-11
#
# Script Name: General Functions
#
# Notes:
# Lots of fun functions I've made to work with.
# LOAD LIBRARIES ------------------------------------------
suppressMessages(library(tidyverse))
# CODE BLOCK ----------------------------------------------
#' Function to summarize results more generally
#' Arguments:
#' results: results of differential expression analysis
#' logFC_column: column to use for logFC
#' pvalue_column: column to use for pvalue
#' padj_column: column to use for padj
#' padj_cutoffs: list of padj cutoffs to use
#' pvalue_cutoffs: list of pvalue cutoffs to use
#' logFC_cutoff: logFC cutoff to use
#' Outputs:
#' summary of results
summarize_experiment <- function(
results,
logFC_column = "log2FoldChange",
pvalue_column = "pvalue",
padj_column = "padj",
pvalue_cutoffs = c(0.01, 0.05, 0.1),
padj_cutoffs = c(0.05, 0.1, 0.2),
logFC_cutoff = 0
) {
# summary of the results at different padj cutoffs
summary <- data.frame(
variable = character(),
p_cutoff = numeric(),
fc_cutoff = numeric(),
n_sig = numeric(),
n_up = numeric(),
n_down = numeric()
)
lapply(pvalue_cutoffs, function(pvalue_cutoff) {
summary <<- summary %>%
add_row(
variable = "pvalue",
p_cutoff = pvalue_cutoff,
fc_cutoff = logFC_cutoff,
n_sig = sum(results[[pvalue_column]] < pvalue_cutoff),
n_up = sum(results[[pvalue_column]] < pvalue_cutoff & results[[logFC_column]] > logFC_cutoff),
n_down = sum(results[[pvalue_column]] < pvalue_cutoff & results[[logFC_column]] < -logFC_cutoff)
)
})
lapply(padj_cutoffs, function(padj_cutoff) {
summary <<- summary %>%
add_row(
variable = "padj",
p_cutoff = padj_cutoff,
fc_cutoff = logFC_cutoff,
n_sig = sum(results[[padj_column]] < padj_cutoff),
n_up = sum(results[[padj_column]] < padj_cutoff & results[[logFC_column]] > logFC_cutoff),
n_down = sum(results[[padj_column]] < padj_cutoff & results[[logFC_column]] < -logFC_cutoff)
)
})
return(summary)
}
#' Function to get genes from a results table
#' Arguments:
#' - results: results table
#' - pval: p-value cutoff
#' - log2fc: log2 fold change cutoff
#' - gene_col: gene column name
#' - pval_col: p-value column name
#' - log2fc_col: log2 fold change column name
#' Returns:
#' - dataframe of genes separated by up and down
getGenes <- function(results, pval = 0.05, metric = 0, name_col = "rownames", pval_col = "padj", metric_col = "log2FoldChange") {
if (name_col == "rownames") {results <- rownames_to_column(results, var = name_col)}
up <- results %>%
dplyr::filter(!!sym(pval_col) < pval & !!sym(metric_col) > metric) %>%
dplyr::pull(!!sym(name_col))
down <- results %>%
dplyr::filter(!!sym(pval_col) < pval & !!sym(metric_col) < metric) %>%
dplyr::pull(!!sym(name_col))
out <- data.frame(features = c(up, down), direction = c(rep("up", length(up)), rep("down", length(down))))
return(out)
}
#' Function to add missing rows to a matrix
#' Arguments:
#' - df: matrix, rows = genes, cols = samples
#' - rows: vector of rownames to add
#' - sorted: logical, whether to sort the rows alphabetically
#' Returns:
#' - df: matrix, rows = genes, cols = samples
add_missing_rows <- function(
df, # cols = samples, rows = genes
rows, # cols = samples, rows = genes
sorted = TRUE ) {
missingRowNames <- rows[which(!rows %in% rownames(df))]
print(missingRowNames)
df_tmp <- as.data.frame(matrix(0,
nrow = length(missingRowNames),
ncol = dim(df)[2]
)
)
# print(dim(df_tmp))
# print(length(missingRowNames))
colnames(df_tmp) <- colnames(df)
rownames(df_tmp) <- missingRowNames
# print(head(df_tmp))
# print(head(df))
df <- rbind(df_tmp, df)
if (sorted) {
df <- df[order(rownames(df)),]
}
return(df)
}
#' Function to make a SummarizedExperiment object
#' Arguments:
#' - countsMatr: matrix of counts, rows = genes, cols = samples
#' - colData: data.frame of sample metadata, rows = samples
#' Returns:
#' - se: SummarizedExperiment object
make_se <- function(countsMatr, colData) {
require(SummarizedExperiment)
require(BiocGenerics)
sample_ids <- intersect(colnames(countsMatr), row.names(colData))
se <- SummarizedExperiment(assays = list(counts = as.matrix(countsMatr[,sample_ids])),
colData = colData[sample_ids,])
return(se)
}
#' Function to save a SummarizedExperiment object into multiple files if the slots are filled in the object
#' Arguments:
#' - se: SummarizedExperiment object
#' - path: path to save files
#' - normalize: how to normalize the counts
save_se <- function(se, path, normalize = 'mor') {
# make sure the path exists
dir.create(path, showWarnings = F, recursive = T)
# extract counts, colData, and rowData
counts <- normalize_counts(se, method = normalize)
colData <- as.data.frame(colData(se))
rowData <- as.data.frame(rowData(se))
# Save counts
write.csv(counts, file = file.path(path, paste0('counts_', normalize,'.csv')), quote = F, row.names = T)
# save the colData if it exists
if (!is.null(colData)) {
write.csv(colData, file = file.path(path, 'metadata.csv'), quote = T, row.names = T)
}
# save the rowData if it exists
if (!is.null(rowData)) {
write.csv(rowData, file = file.path(path, 'rowData.csv'), quote = T, row.names = T)
}
}
#' Function to summarize a dataframe
#' Arguments:
#' - df: dataframe to summarize
#' Returns:
#' - df: dataframe with summary statistics
summarize_df <- function(df) {
df %>%
summary() %>%
as.data.frame() %>%
rownames_to_column() %>%
rename('variable' = 'rowname') %>%
mutate(variable = as.character(variable))
return(df)
}
#' Function to normalize dds object and return counts
#' Arguments:
#' - dds: DESeq2 object
#' - method: normalization method
#' Returns:
#' - counts: normalized counts
normalize_counts <- function(dds, method = 'mor', log2 = FALSE) {
suppressPackageStartupMessages(require(DESeq2))
suppressPackageStartupMessages(require(SummarizedExperiment))
suppressPackageStartupMessages(require(BiocGenerics))
suppressPackageStartupMessages(require(edgeR))
suppressPackageStartupMessages(require(singscore))
# Error handling
options <- c('mor', 'vst', 'vsd', 'log2', 'rld', 'cpm', 'rlog', 'rpkm', 'none', 'tmm', 'log2-mor', 'rank')
if (method %in% options) {
normalize <- method
} else {
stop('Invalid normalization method. Please choose from: log2-mor, mor, vst, vsd, log2, rld, cpm, rlog, rpkm, tmm, rank, none')
}
# Get normalized counts
if (normalize == "mor") {counttable <- counts(dds, normalize = T)}
if (normalize %in% c("vsd", "vst")) {counttable <- assay(varianceStabilizingTransformation(dds))}
if (normalize == "log2") {counttable <- log2(assay(dds)+1)}
if (normalize == "log2-mor") {counttable <- log2(counts(dds, normalize = T)+1)}
if (normalize == "rld") {counttable <- rlog(dds)}
if (normalize == "cpm") {counttable <- cpm(dds)}
if (normalize == "rlog") {counttable <- rlog(dds)}
if (normalize == "rpkm") {counttable <- rpkm(dds)}
if (normalize == "none") {counttable <- counts(dds)}
if (normalize == "tmm") {counttable <- cpm(calcNormFactors(dds, method = "TMM"))}
if (normalize == "rank") {counttable <- rankGenes(dds)}
if (normalize == "none") {counttable <- counts(dds)}
counts <- as.data.frame(counttable)
if (log2) {
counts <- log2(counts + 1)
}
return(counts)
}
#' Function to turn list of lists into a dataframe
#' Arguments:
#' - list: list of lists
#' Returns:
#' - df: dataframe
list_of_lists_to_df <- function(list) {
df <- do.call(rbind, lapply(list, function(x) data.frame(x)))
return(df)
}
# Function to remove NA variables from a summarized experiment object
# Arguments:
# se: SummarizedExperiment object
# columns: list of columns to remove NAs from
# Outputs:
# DESeqDataSet object with NAs removed
remove_na_variables <- function(se, columns) {
# Get the colData from the summarized experiment object
col_data <- as.data.frame(colData(se))
assay_data <- assay(se)
# Update the DESeqDataSet object with the modified colData
col_data <- drop_na(col_data, any_of(columns))
col_data <- DataFrame(col_data)
new_se <- make_se(assay_data, col_data)
# Return the new DESeqDataSet object
return(new_se)
}
#' Function to get the number of samples in each group
#' Arguments:
#' - metadata: dataframe, metadata
#' - id: character, column name of the sample ID
#' - events_term: character, prefix of the events columns
#' - subset: character, column name of the grouping variable
get_events_breakdown <- function(metadata, id = 'PATNUM', events_term = 'C_', subset = NULL) {
# get the match of the PATNUMs
metadata_patnums <- metadata[,id]
if (!is.null(subset)) {
patnum_match <- metadata_patnums[metadata[,subset] == 1]
} else {
patnum_match <- metadata_patnums
}
# catch if there are no matches
if (is.na(patnum_match[1])) {
stop('No matches found from the PATNUMs provided')
}
# Subset the metadata and break down the subtypes
subtype_breakdown <- metadata[patnum_match,] %>%
select(starts_with(events_term)) %>%
# count the 1s and 0s
summarise_all(~ sum(., na.rm = TRUE)) %>%
mutate( total = rowSums(.))
return(subtype_breakdown)
}
#' Function to get make pairwise combinations
#' Arguments:
#' - vec: vector of values
#' Returns:
#' - list of pairwise combinations
pairwise_combos <- function(vec) {
unique_vals <- unique(vec)
combos <- combn(unique_vals, 2)
list_of_combos <- list()
for (i in 1:ncol(combos)) {
list_of_combos[[i]] <- combos[, i]
}
return(list_of_combos)
}
#' Function to get the variable name
#' Arguments:
#' - var: variable name
#' Returns:
#' - varName: character, variable name
varName <- function(var) {
deparse(substitute(var))
}
#' Function to one hot encode dataframe column
#' Arguments:
#' - df: dataframe
#' - column: character, column name
#' - binary: logical, whether to make the column binary
#' Returns:
#' - df: dataframe with one hot encoded column
one_hot_encode_ovr <- function(df, column, binary = TRUE) {
# Get the unique values of the column
unique_vals <- unique(df[[column]])
# For each unique value, create a new binary column
for (val in unique_vals) {
# Create a new column name
new_col_name <- paste0(column, "_", val)
message(glue("Creating column: {new_col_name}"))
# Create the new column in the data frame
if (binary) {
df[[new_col_name]] <- ifelse(df[[column]] == val, 1, 0)
} else {
df[[new_col_name]] <- factor(ifelse(df[[column]] == val, val, 'rest'), levels = c('rest', val))
}
}
# Return the modified data frame
return(df)
}