-
Notifications
You must be signed in to change notification settings - Fork 1
/
clustering_functions.R
225 lines (189 loc) · 6.91 KB
/
clustering_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# HEADER --------------------------------------------------
# Author: Matthew Muller
#
# Date: 2023-01-11
#
# Script Name: clustering functions
#
# Notes:
# Lots of fun functions I've made to work with for clustering
# LOAD LIBRARIES ------------------------------------------
suppressMessages(library(tidyverse))
suppressMessages(library(ggplot2))
suppressMessages(library(NMF))
# CODE BLOCK ----------------------------------------------
# Add code here
# Function to prep a counts matrix for nmf
# Arguments:
# - counts_matr: matrix, rows = genes, cols = samples
# - outfile: file to save output to
# - ...: other arguments to pass to nmf
# Returns:
# - output: list of nmf and random nmf results
# - saves output to file
prep_nmf_data <- function(counts_matr) {
# We are doing to log transform the data and then scale it.
# To get around negative values, we are going to make a positive and negative matrix
# and then combine them
# First, log transform the data
log_counts_matr <- log2(counts_matr + 1)
# Next, scale the data
scaled_counts_matr <- t(scale(t(log_counts_matr)))
# Now, we need to make a positive and negative matrix
# First, we need to get the dimensions of the matrix
nrow <- dim(scaled_counts_matr)[1]
ncol <- dim(scaled_counts_matr)[2]
# Now, we can make the positive and negative matrices
pos_matr <- matrix(0, nrow = nrow, ncol = ncol)
neg_matr <- matrix(0, nrow = nrow, ncol = ncol)
# Now, we can fill them
pos_matr[scaled_counts_matr > 0] <- scaled_counts_matr[scaled_counts_matr > 0]
neg_matr[scaled_counts_matr < 0] <- abs(scaled_counts_matr[scaled_counts_matr < 0])
# Now, we can combine them
combined_matr <- rbind(pos_matr, neg_matr)
# Now, we can return the output
return(combined_matr)
}
# Function to estimate the rank of a matrix
# Arguments:
# - counts_matr: matrix, rows = genes, cols = samples
# - outfile: file to save output to
# - ranks: vector of ranks to test
# - runs: number of runs to test
# - options: options for nmf
# - ...: other arguments to pass to nmf
# Returns:
# - output: list of nmf and random nmf results
# - saves output to file
nmf_estimator <- function(
counts_matr,
outfile,
ranks = 2:8, runs = 30,
options = 'v',
...
) {
require(NMF)
# Call nmf with some settings to make test the ranks chosen
# We also are going to randomize the data to see how it compares to
# data that is purely random in terms of residuals and other metrics
# Ideally, the next step is to plot the NMF
nmf_ranks_out <- nmf(counts_matr, ranks, nrun = runs,
.opt = options, ...)
rng_ranks_out <- nmf(randomize(counts_matr), ranks, nrun = runs,
.opt = options, ...)
output <- list('nmf_ranks_out' = nmf_ranks_out, 'rng_ranks_out' = rng_ranks_out)
saveRDS(output, file = outfile)
return(output)
}
# Function to plot the results of nmf_estimator
# Arguments:
# - nmf_out: output from nmf_estimator
# - outfile: file to save output to
# - plot: whether to plot the results
# Returns:
# - output: list of plots
# - saves output to file
nmf_plotter <- function(nmf_out) {
require(NMF)
# Plot the results of nmf_estimator
# We want to create a new metric that is the cophenetic correlation * dispersion
# This will give us a better idea of how well the rank is doing
# First, we need to get the cophenetic correlation
cophenetic_correlation <- nmf_out$nmf_ranks_out$cophenetic
# Next, we need to get the dispersion
dispersion <- nmf_out$nmf_ranks_out$dispersion
# Now, we can multiply them together
cophenetic_dispersion <- cophenetic_correlation * dispersion
# Now, we need to get the random cophenetic dispersion
rng_cophenetic_correlation <- nmf_out$rng_ranks_out$cophenetic
rng_dispersion <- nmf_out$rng_ranks_out$dispersion
rng_cophenetic_dispersion <- rng_cophenetic_correlation * rng_dispersion
# now combine them into a data frame
cophenetic_df <- data.frame('rank' = nmf_out$nmf_ranks_out$rank,
'cophenetic' = cophenetic_correlation,
'dispersion' = dispersion,
'cophenetic_dispersion' = cophenetic_dispersion,
'type' = 'nmf')
rng_cophenetic_df <- data.frame('rank' = nmf_out$rng_ranks_out$rank,
'cophenetic' = rng_cophenetic_correlation,
'dispersion' = rng_dispersion,
'cophenetic_dispersion' = rng_cophenetic_dispersion,
'type' = 'random')
# combine them
combined_df <- rbind(cophenetic_df, rng_cophenetic_df)
# plot them
cophenetic_plot <- ggplot(combined_df, aes(x = rank, y = cophenetic_dispersion, color = type)) +
geom_point() +
labs(x = "Rank", y = "Cophenetic Dispersion") +
theme_bw()
}
# Function to estimate the kmeans clustering of a matrix using the elbow method
# Arguments:
# - counts_matr: matrix, rows = genes, cols = samples
# - outfile: file to save output to
# - ks: vector of ks to test
# - runs: number of runs to test
# - ...: other arguments to pass to kmeans
# Returns:
# - output: list of kmeans
# - saves output to file
kmeans_estimator <- function(
counts_matr,
ks = 2:8,
outfile,
...
) {
km_list <- list()
for (k in ks){
km_list[[k]] <- kmeans(counts_matr, k, ...)
}
output <- list('km_list' = km_list)
saveRDS(output, file = outfile)
return(output)
}
# Function to estimate the hierarchical clustering of a matrix
# Arguments:
# - counts_matr: matrix, rows = genes, cols = samples
# - clusters: vector of clusters to test
# - outfile: file to save output to
# - ...: other arguments to pass to hclust
# Returns:
# - output: list of clustering
# - saves output to file
hclust_estimator <- function(
counts_matr,
clusters = 2:8,
outfile,
...
) {
hclust_list <- list()
for (k in clusters){
hclust_list[[k]] <- hclust(dist(counts_matr), ...)
}
output <- list('hclust_list' = hclust_list)
saveRDS(output, file = outfile)
return(output)
}
#======================== Old Code ========================#
# rank_estimator <- function(
# expression_matrix,
# ranks = c(2:8),
# save_file = NULL
# ){
# require(NMF)
# # Runk NMF ranks
# rank_out = nmf(expression_matrix, rank = ranks,
# method = nmf.getOption("default.algorithm"),
# nrun = 50)
# # Graph cophenetic correlation plots
# cophenetic_correlation_plot <- ggplot(rank_out,
# aes(x=rank, y=cophenetic)) +
# geom_point() + labs(x="Rank", y="Cophenetic Correlation Coefficient")
# # Save data
# if (!is.null(save_file)) {
# save(rank_out, cophentic_correlation_plot,
# file = save_file)
# }
# output <- list(rank_out, cophenetic_correlation_plot)
# return(output)
# }