-
Notifications
You must be signed in to change notification settings - Fork 3
/
40_config.R
118 lines (94 loc) · 3.15 KB
/
40_config.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# Dataset and database ----

# Big files, locally stored in UPC's servers:
# directory with the regularised laplacian kernels
dir_kernel <- "~/all/devel/big/diffusion/gsk/"

# Directory for reproducibility metadata
dir_metadata <- "00_metadata"

# Raw data directory and the input files read from it
dir_raw <- "00_rawdata"
file_alldiseases <- file.path(
  dir_raw,
  "17.06_ot_commondisease_associations_filtered.csv"
)
file_complexes <- file.path(dir_raw, "OT-000-20-2_out.rda")
file_omnipath <- file.path(dir_raw, "omnipath.rda")

# Thresholds used to generate the dataset; these were already computed
# for the STRING database
col_genetic <- "association_score.datatypes.genetic_association"
threshold_genetic <- 0.16

# The number of genes in the new network does not vary greatly, so the
# same disease list as for STRING is reused
file_list_diseases <- "20_data/diseases_over_50_genes.txt"
# Output directories ----

# Data for the runs, and the files stored inside that directory:
# identifier mapping table, network graph, preprocessed input tables
dir_data <- "40_data"
file_mapping <- file.path(dir_data, "table_map_ensembl2uniprot.RData")
graph_alldiseases <- file.path(dir_data, "graph_alldiseases.RData")
file_input <- file.path(dir_data, "tables_input.RData")

# Performance metrics and models
dir_performance <- "42_performance"
dir_models <- "43_models"

# Analysis of the topology and the positives
dir_complexes <- "43_complexes"
dir_boxplots <- "43_boxplots"
dir_contrasts <- "43_contrasts"
dir_topology <- "44_topology"

# MashUp directory and node names file
dir_mashup <- "45_mashup"
file_mashup_names <- "45_mashup/node_names.txt"

# Big network files, placed under dir_kernel (set in the dataset section).
# The "/" is added unconditionally, so a dir_kernel that already ends in
# "/" gives a doubled separator; the resulting string is kept as is.
file_kernel <- sprintf("%s/omnipath.RData", dir_kernel)
file_mashup_features <- sprintf("%s/omnipath_features.csv", dir_kernel)
# Other params ----

# Regularised laplacian kernel: normalised or not
kernel_normalised <- FALSE

# Repeated k-fold cross validation: available schemes, number of folds
# and number of repetitions
list_cv_schemes <- c("classic", "block", "representative")
k_cv <- 3
times_cv <- 25

# diffuStats methods
# (mc_nperm is presumably the permutation count for "mc" -- TODO confirm)
list_methods <- c("raw", "gm", "mc", "z")
mc_nperm <- 1000

# COSNet cost parameter
cosnet_cost <- 0.0001

# RANKS parameters
wsld_d <- 2
knn_k <- 3

# Parameters for the bagged svm and the ML methods (svm, rf) are not set
# here: they can be found in their function definitions in
# 22_performance.Rmd

# NOTE(review): presumably the column holding the gene identifier;
# verify against the scripts that read it
colname_symbol <- "target.id"
#################################
# Hostnames
#################################
# Get the host name of the machine running this script.
# Our machines at UPC are "eko" and "sun".
host <- Sys.info()["nodename"]
print(paste("You are running the CV on the host:", host))

# Options for each machine:
# - nslaves: number of threads to use
# - cv_jobs: cv schemes to run (each cv produces its own file,
#   outputs can be combined later between machines).
#   Order matters!
#
# If you want to add your own machines, you can do so by
# appending more branches with their hostnames.
#
# identical() on the unnamed value is used instead of `==` so that a
# missing host name (Sys.info() may return NULL where it is not
# implemented) falls through to the defaults instead of raising an
# "argument is of length zero" error.
if (identical(unname(host), "eko")) {
  nslaves <- 12
  cv_jobs <- list_cv_schemes[3]
} else if (identical(unname(host), "sun")) {
  nslaves <- 20
  cv_jobs <- list_cv_schemes[1:3]
} else {
  print("This host is unknown. Using default options in 40_config.R")
  # If the host is unknown, use all the cores and
  # run all the validation schemes
  nslaves <- parallel::detectCores()
  # detectCores() returns NA when the number of cores is unknown;
  # fall back to a single thread rather than propagating NA
  if (is.na(nslaves)) {
    nslaves <- 1L
  }
  cv_jobs <- list_cv_schemes
}