-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path13_complexes.Rmd
369 lines (320 loc) · 10.7 KB
/
13_complexes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
---
title: "Analysis of the complex data"
author: "Sergio Picart-Armada"
date: "December 4, 2017"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```
# Getting started
```{r}
library(plyr)
library(dplyr)
library(magrittr)
library(tidyr)
library(caret)
library(igraph)
library(ggplot2)
library(ggdendro)
# have all config variables in a different env
config <- new.env(parent = globalenv())
source("03_config.R", local = config)
# load dataset and kernel
load(config$graph_alldisease)
load(config$file_complexes)
# load(config$file_kernel3)
df_disease <- mutate(g_filter$dataset, disease = as.factor(disease.efo_info.label)) %>%
rename(drugs = known_drug_binary, genetic = known_gene_binary) %>%
mutate(validation = drugs)
n <- vcount(g_filter)
df_input <- plyr::ddply(
reshape2::melt(df_disease, measure.vars = c("drugs", "genetic"),
variable.name = "input_type", value.name = "input"),
c("disease", "input_type"),
function(disease) {
# browser()
nm <- V(g_filter)$name
x <- nm %in% (filter(disease, input == 1)$STRING_id)
val <- nm %in% (filter(disease, validation == 1)$STRING_id)
data.frame(
STRING_id = nm,
score = x*1,
validation = val*1
)
},
.progress = "text"
)
```
# Preprocess the complex data
```{r}
# the list in David's file is called "a" and we want the gene-complex mapping
# it is stored in a$all$genes.for.complexes
# (current criteria is in favour of using "all")
list_complex_raw <- a$all$genes.for.complexes
# as a list
list_complex <- plyr::llply(
list_complex_raw,
function(prot) {
intersect(paste0("9606.", as.character(prot)), V(g_filter)$name)
}
)
# as a data frame
df_complex <- plyr::ldply(
list_complex,
function(prot) data.frame(STRING_id = prot),
.id = "complex_id"
)
write.table(
df_complex,
paste0(config$dir_complexes, "/map_complex_to_protein.csv"),
sep = "\t", quote = FALSE,
row.names = FALSE)
```
# Descriptive statistics on complexes
```{r}
df_stats_complex <- plyr::ldply(
list_complex_raw,
function(proteins) {
prot <- intersect(paste0("9606.", as.character(proteins)), V(g_filter)$name)
n_before <- length(proteins)
n_after <- length(prot)
df_complex_disease <- filter(
df_input, (input_type == "drugs") & (STRING_id %in% prot) & (validation == 1L))
n_disease <- length(unique(df_complex_disease$disease))
drug_genes <- unique(df_complex_disease$STRING_id)
n_drug_genes <- length(drug_genes)
data.frame(
n_before = n_before,
n_after = n_after,
proportion_mapping = n_after/n_before,
n_disease = n_disease,
n_drug_genes = n_drug_genes,
n_diff = n_before - n_after,
n_nodrug = n_after - n_drug_genes
)
},
.id = "complex_id",
.progress = "text"
)
write.csv(df_stats_complex,
paste0(config$dir_complexes, "/descriptive_by_complex.csv"),
row.names = FALSE)
```
## Mapping to network
```{r}
ggplot(gather(df_stats_complex, when, n, n_before, n_after, factor_key = TRUE)) +
geom_histogram(aes(x = n), position = "dodge") +
facet_wrap(~when) +
theme_bw() +
ggtitle("Proteins mapping to the network") +
xlab("Number of proteins in complex") +
ylab("Number of complexes") +
theme(aspect.ratio = 1)
ggsave(paste0(config$dir_complexes, "/plot_protein_mapping_effect.png"), width = 6, height = 4)
```
```{r}
ggplot(df_stats_complex, aes(x = proportion_mapping)) +
geom_histogram(bins = 10) +
theme_bw() +
xlab("Fraction of proteins mapping to network") +
ylab("Number of complexes")
ggsave(paste0(config$dir_complexes, "/plot_protein_mapping_fraction.png"), width = 6, height = 4)
```
## Complexes and diseases
```{r}
ggplot(df_stats_complex, aes(x = n_disease)) +
geom_bar(stat = "count") +
theme_bw() +
xlab("Diseases per complex") +
ylab("Number of complexes")
ggsave(paste0(config$dir_complexes, "/plot_diseases_per_complex.png"), width = 6, height = 4)
```
```{r}
ggplot(df_stats_complex, aes(x = n_nodrug/n_after)) +
geom_histogram(bins = 10) +
theme_bw() +
xlab("Proportion of drug-genes") +
ylab("Number of complexes")
ggsave(paste0(config$dir_complexes, "/plot_drug_genes_fraction.png"), width = 6, height = 4)
```
```{r}
# in terms of disease
df_stats_disease <- plyr::ddply(
df_input,
"disease",
function(df) {
# browser()
# find disease genes
gene_disease <- filter(df, validation == 1L)$STRING_id %>% as.character %>% unique
# find their associated complexes
df_compl <- filter(df_complex, STRING_id %in% gene_disease)
# genes in complexes and in the disease
gene_compl <- df_compl$STRING_id %>% unique
# identifiers of the complexes
id_compl <- df_compl$complex_id %>% as.character %>% unique
# find the other genes in the complexes
df_whole_compl <- filter(df_complex, complex_id %in% id_compl) %>%
mutate(complex_id = droplevels(complex_id))
gene_whole_compl <- df_whole_compl$STRING_id %>% as.character %>% unique
data.frame(
n_genes = length(gene_disease),
n_complex_genes = length(gene_compl),
proportion_complex_genes = length(gene_compl)/length(gene_disease),
n_complexes = length(id_compl),
n_genes_union_complexes = length(gene_whole_compl),
proportion_union_in_disease = length(gene_compl)/length(gene_whole_compl)
)
}
)
write.csv(df_stats_disease,
paste0(config$dir_complexes, "/descriptive_by_disease.csv"),
row.names = FALSE)
```
## Intersection
```{r}
union_complex <- list_complex %>% unlist %>% unique
union_disease <- filter(df_input, validation == 1L)$STRING_id %>% unique %>% as.character
id_disease <- V(g_filter)$name %in% union_disease
id_complex <- V(g_filter)$name %in% union_complex
test_overlap <- fisher.test(
x = id_disease,
y = id_complex
)
writeLines(
capture.output(print(test_overlap)),
paste0(config$dir_complexes, "/test_overlap_disease_complex.txt")
)
```
## Cross validation viability
```{r, eval=FALSE}
# cross validation parameters
k_cv <- 3
rep_cv <- 20
set.seed(1)
# Build the folds and compute statistics on them
df_cv <- plyr::ddply(
df_input,
"disease",
function(df) {
# browser()
# find disease genes
gene_disease <- filter(df, validation == 1L)$STRING_id %>% as.character %>% unique
# find their associated complexes
df_compl <- filter(df_complex, STRING_id %in% gene_disease)
# genes in complexes and in the disease
gene_compl <- df_compl$STRING_id %>% unique
# identifiers of the complexes
id_compl <- df_compl$complex_id %>% as.character %>% unique
# have the complexes as a list
list_whole_compl <- list_complex[id_compl]
gene_whole_compl <- list_whole_compl %>% unlist %>% unique
# # merge the complexes until no more merging is needed
# # then... how many complexes do we have?
# # can we run CV on that?
# #
# # Compute binary matrix of complexes
# mat_complex <- plyr::daply(
# df_whole_compl, "complex_id",
# function(df_c) V(g_filter)$name %in% as.character(df_c$STRING_id))
# # compute overlap between complexes
# mat_overlap <- tcrossprod(mat_complex)
# # find connected components
# # I define a graph with these edges
# # we simplify it (no loops)
# g <- graph_from_adjacency_matrix(mat_overlap) %>% simplify
# g_components <- clusters(g)
# union of disease genes and complex genes for that disease
gene_union <- union(gene_whole_compl, gene_disease)
n <- length(gene_union)
# First, an empty graph
g <- graph.empty(n)
V(g)$name <- gene_union
# Modify its adjacency matrix by adding edges between all the
# genes in a complex
mat_adj <- get.adjacency(g)
for (com in list_whole_compl) {
mat_adj[com, com] <- 1
}
# Redefine the graph with the right adjacency
g <- graph_from_adjacency_matrix(mat_adj, mode = "undirected")
g_components <- clusters(g)
# Get the labels for each gene
labels_gene <- g_components$membership
labels <- 1:(g_components$no)
# Number of genes in each component
labels_size <- g_components$csize
# Compute the number of disease genes per component
# (because some of them might not be fully made of disease genes...)
labels_positives <- sapply(
labels,
function(lab) {
genes <- names(labels_gene)[labels_gene == lab]
length(intersect(genes, gene_disease))
}
)
stopifnot(all(labels_size >= labels_positives))
# emulate the cross validation
df_ans <- plyr::ldply(
paste0("Rep", 1:rep_cv),
function(rep_number) {
# browser()
# shuffle the components
shuffle_labels <- sample(labels)
# cumulative number of disease genes in this ordering
shuffle_sum <- labels_positives[shuffle_labels]
shuffle_cumsum <- cumsum(shuffle_sum)
# Total number of disease genes
total_sum <- length(gene_disease)
stopifnot(total_sum == tail(shuffle_cumsum, 1))
# Theoretical endpoints
endpoints <- ((1:k_cv - 1)/k_cv)*total_sum
# Find the closest cuts
cuts <- sapply(endpoints, function(x) which.min(abs(shuffle_cumsum - x)))
# In which cut is each position?
folds <- sapply(seq_along(shuffle_sum), function(x) sum(x >= cuts)) %>%
paste0("Fold", .)
# sums of positives by fold
split_fold <- split(shuffle_sum, f = folds) %>%
ldply(function(fold) data.frame(rep = rep_number, positives = sum(fold)), .id = "fold") %>%
mutate(proportion = positives/total_sum)
split_fold
}
)
df_ans
},
.progress = "text"
)
```
```{r}
# Table with summary of the methods
df_stats_cv <- plyr::ddply(
df_cv, "disease", function(df) {
data.frame(
under_10_percent = mean(df$proportion < .1),
over_50_percent = mean(df$proportion > .5))
}
)
write.csv(df_stats_cv,
paste0(config$dir_complexes, "/descriptive_cv_folds.csv"),
row.names = FALSE)
```
```{r}
ggplot(df_cv, aes(x = disease, y = proportion)) +
geom_hline(yintercept = 1/k_cv, color = "#EE9E9E") +
geom_boxplot() +
# scale_x_reverse()+
xlab("") +
ylab("Proportion of drug genes in fold") +
ggtitle(paste0("Balance of the k-fold cross-validation folds"),
subtitle = paste0("k=", k_cv, " folds, ", rep_cv, " repetitions of k-fold cross-validation")) +
coord_flip() +
theme_bw() +
theme(aspect.ratio = 1)
ggsave(paste0(config$dir_complexes, "/plot_cv_balance_by_disease.png"), width = 8, height = 6)
```
# Reproducibility
```{r}
out <- capture.output(sessionInfo())
writeLines(out, con = paste0(config$dir_metadata, "/13_sessionInfo_disease.txt"))
```