forked from syadlowsky/revised-pooled-ascvd
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathreplicate_pces.R
151 lines (123 loc) · 5.41 KB
/
replicate_pces.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
library(caret)
library(config)
library(glmnet)
library(ggsci)
library(grid)
library(ggplot2)
library(gridExtra)
# Maybe don't need all of these...
library(parallel)
library(doParallel)
library(survival)
library(pROC)
library(PredictABEL)
source("utils.R")
source("data_loaders.R")
source("plots.R")
source("nhanes_analysis.R")
source("calculate_stats.R")
source("validation_results.R")
source("subgroup_model.R")
source("baseline_model.R")
source("pce_model.R")
set.seed(conf$seed)
# PCE's use all data from CARDIA, CHS, ARIC, FRAM, and FRAMOFF
# and test on JHS and MESA, so we need to divide up the data differently.
training.data <- load.training.data(paste(conf$working_dir, conf$train_file, sep="/"))
training.data$fold <- NULL
test.data <- load.test.data(paste(conf$working_dir, conf$test_file, sep="/"))
data <- rbind(training.data, test.data)
training.data <- subset(data, study == "CHS" |
study == "FRAM" |
study == "CARD" |
study == "FRAM" |
study == "FRAMOFF" |
study == "ARIC")
test.data <- subset(data, study == "MESA" |
study == "JHS")
nhanes.data <- load.nhanes(paste(conf$working_dir, conf$nhanes_csv, sep="/"))
feature.equation.age.interactions <-
outcome ~ I(log(age)^2) + log(age) * I((1-rxbp) * log(sysbp)) + log(age) * dm +
log(age) * cursmoke + log(age) * log(totchol) +
log(age) * log(hdl) + log(age) * I(rxbp * log(sysbp)) + 0
if (conf$fit_model) {
pce_pre_selected_model <- pce_model(training.data)
pce_model_selection <- race_and_sex_subgroup_model(
feature.equation.age.interactions, fit.baseline.model, training.data)
save(pce_pre_selected_model, pce_model_selection,
file=conf$replication_model)
} else {
load(file=conf$replication_model)
}
cat("PCE Pre-selected Model Coefs\n")
print(pce_pre_selected_model)
cat("PCE Methodology applied to pooled cohorts Coefs\n")
print(pce_model_selection)
svg(paste(conf$working_dir, "nhanes_pce_pre_selected_model.svg", sep="/"), width=4, height=4)
nhanes.ratio(nhanes.data,
pce_pre_selected_model,
"Using model selected by AHA/ACC",
F)
dev.off()
svg(paste(conf$working_dir, "nhanes_pce_model_selection.svg", sep="/"), width=4, height=4)
nhanes.ratio(nhanes.data,
pce_model_selection,
"Using model selection methodology of AHA/ACC",
F)
dev.off()
pce_pre_selected_model.results <- test.data
pce_pre_selected_model.results$risk <- predict(pce_pre_selected_model, pce_pre_selected_model.results)
pce_pre_selected_model.results$link <- link(pce_pre_selected_model, pce_pre_selected_model.results)
pce_model_selection.results <- test.data
pce_model_selection.results$risk <- predict(pce_model_selection, pce_model_selection.results)
pce_model_selection.results$link <- link(pce_model_selection, pce_model_selection.results)
pce_pre_selected_model.stats <- validation.statistics(pce_pre_selected_model.results,
training.data=training.data)
pce_model_selection.stats <- validation.statistics(
pce_model_selection.results, pce_pre_selected_model.results,
training.data=training.data)
print.validation.report(
concat.validation.report(
report.from.validation(
pce_pre_selected_model.stats, name="Using model selected by AHA/ACC"),
report.from.validation(
pce_model_selection.stats, name="Using model selection methodology of AHA/ACC")))
for (group in 1:4) {
svg(paste("replication_calibration",group,".svg", sep=""), width=4, height=4)
group.pce_pre_selected_model.stats <- pce_pre_selected_model.stats[[group]]
group.pce_model_selection.stats <- pce_model_selection.stats[[group]]
p <- ggplot(group.pce_pre_selected_model.stats$calibration,
name="Pre selected model")
p <- ggplot(group.pce_model_selection.stats$calibration,
name="AHA/ACC selection methodolgy", guide=F, plot.obj=p) +
ggtitle(paste("Calibration curves for risk models among ",
group_to_description(group))) +
scale_color_jama(name="Model") +
scale_shape_discrete(name="Model") +
theme(plot.title = element_text(hjust = 0.5, size=10),
axis.title.y = element_text(size=10),
axis.title.x = element_text(size=10),
legend.justification=c(1,0), legend.position=c(0.91,0.112))
p_inset <- ggplot(group.pce_pre_selected_model.stats$calibration,
name="Pre selected model", ylim=0.15, xlim=0.15)
p_inset <- ggplot(group.pce_model_selection.stats$calibration,
name="AHA/ACC selection methodolgy", guide=F, plot.obj=p_inset,
ylim=0.15, xlim=0.15) +
ggtitle("Range [0, 0.15] for clarity") +
scale_color_jama(name="Model") +
scale_shape_discrete(name="Model") +
theme(
axis.title.x=element_blank(),
axis.title.y=element_blank(),
text = element_text(size=7),
plot.title = element_text(size=7),
legend.position="none"
)
p <- p + annotation_custom(grob=ggplotGrob(p_inset),
xmin=0.5,
xmax=1.0,
ymin=0.5,
ymax=1.0)
print(p)
dev.off()
}