-
Notifications
You must be signed in to change notification settings - Fork 0
/
construct_models.R
94 lines (70 loc) · 2.76 KB
/
construct_models.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# R script to construct prediction models for fecal microbial load
# load library
library(tidyverse)
library(data.table)
library(ggpubr)
library(vegan)
library(doMC)
library(MLmetrics)
library(caret)
library(xgboost)
registerDoMC(cores = 4)
## define function
construct_model <- function(d, count, cutoff = 1E-4, pseud = 1E-4, data){
set.seed(0)
## add Shannon diversity
d$`Shannon diversity` <- diversity(d)
## filtering minor species
keep <- apply(d, 2, mean) > cutoff & apply(d > 0, 2, mean) > 0.1
d <- d[, keep]
## scaling
d <- scale(log10(d + pseud))
#train_control <- trainControl(method = "repeatedcv", savePredictions = T, repeats = 5)
train_control <- trainControl(method = "repeatedcv", savePredictions = T, repeats = 1)
tune_grid <- expand.grid(
nrounds=c(100, 500, 1000),
max_depth = c(3:5),
eta = c(0.01, 0.05, 0.1),
gamma = c(0.01),
colsample_bytree = c(0.75),
subsample = 1,
min_child_weight = 1
)
model <- caret::train(d, count, trControl = train_control, method = "xgbTree", metric = "Rsquared", tuneGrid = tune_grid)
## save result
out <- paste0("out/model/model.", data, ".rds")
saveRDS(model, file = out)
}
## read data
## GALAXY/MicrobLiver
d <- fread("data/GALAXY_mOTUs_v25.tsv") %>% data.frame(check.names = F) %>% column_to_rownames("V1")
c <- read.delim("data/GALAXY_load.tsv", header = T, check.names = F)
count <- c$count %>% log10()
construct_model(d, count, 1E-4, 1E-4, "GALAXY.mOTUs")
## MetaCardis
d <- fread("data/MetaCardis_mOTUs_v25.tsv") %>% data.frame(check.names = F) %>% column_to_rownames("V1")
c <- read.delim("data/MetaCardis_load.tsv", header = T, check.names = F)
count <- c$count %>% log10()
construct_model(d, count, 1E-4, 1E-4, "MetaCardis.mOTUs")
## GALAXY/MicrobLiver (KO)
d <- fread("data/GALAXY_KO.tsv") %>% data.frame(check.names = F) %>% column_to_rownames("V1")
c <- read.delim("data/GALAXY_load.tsv", header = T, check.names = F)
count <- c$count %>% log10()
construct_model(d, count, 1E-6, 1E-7, "GALAXY.KO")
## MetaCardis (KO)
d <- fread("data/MetaCardis_KO.tsv") %>% data.frame(check.names = F) %>% column_to_rownames("V1")
c <- read.delim("data/MetaCardis_load.tsv", header = T, check.names = F)
count <- c$count %>% log10()
construct_model(d, count, 1E-6, 1E-7, "MetaCardis.KO")
## Vandeputte_2021_16S
d <- fread("data/Vandeputte_2021_16S.tsv") %>% data.frame(check.names = F) %>% column_to_rownames("V1")
c <- read.delim("data/Vandeputte_2021_load.tsv", row.names = 1, header = T, check.names = F)
keep <- apply(is.na(d), 1, sum) == 0
d <- d[keep, ]
c <- c[keep, ]
keep <- apply(is.na(c), 1, sum) == 0
d <- d[keep, ]
c <- c[keep, ]
count <- log10(as.numeric(c$count))
d <- d %>% as.matrix() %>% prop.table(1)
construct_model(d, count, 1E-4, 1E-4, "Vandeputte_2021.16S")