# This is where all functions related to absolute risk estimation should appear
#' Compute absolute risks using the fitted hazard function.
#'
#' Using the output of the function \code{fitSmoothHazard}, we can compute
#' absolute risks by integrating the fitted hazard function over a time period
#' and then converting this to an estimated survival for each individual.
#'
#' If \code{newdata = "typical"}, we create a typical covariate profile for the
#' absolute risk computation. This means that we take the median for numerical
#' and date variables, and we take the most common level for factor variables.
#'
#' In general, the output will include a column corresponding to the provided
#' time points. A few modifications are made to the \code{time} vector:
#' \code{time = 0} is added, the time points are sorted, and duplicates are
#' removed. These modifications simplify the computations and yield an output
#' that can easily be used to plot risk curves.
#'
#' If there is no competing risk, the output is a matrix where each column
#' corresponds to one covariate profile and each row corresponds to a time
#' point. If there are competing risks, the output is a 3-dimensional array,
#' with the third dimension corresponding to the different event types.
#'
#' The numerical method should be good enough in most situations, but Monte
#' Carlo integration can give more accurate results when the estimated hazard
#' function is not smooth (e.g. when modeling with time-varying covariates);
#' see the examples below.
#'
#' @param object Output of function \code{\link{fitSmoothHazard}}.
#' @param time A vector of time points at which we should compute the absolute
#'   risks.
#' @param newdata Optionally, a data frame in which to look for variables with
#'   which to predict. If omitted, the mean absolute risk is returned.
#'   Alternatively, if \code{newdata = "typical"}, the absolute risk will be
#'   computed at a "typical" covariate profile (see Details).
#' @param method Method used for integration. Defaults to \code{"numerical"},
#'   which uses the trapezoidal rule to integrate over all time points
#'   together. The only other option is \code{"montecarlo"}, which implements
#'   Monte Carlo integration.
#' @param nsamp Maximal number of subdivisions (if \code{method = "numerical"})
#'   or number of sampled points (if \code{method = "montecarlo"}).
#' @param n.trees Number of trees used in the prediction (for class \code{gbm}).
#' @param s Value of the penalty parameter lambda at which predictions are
#'   required (for class \code{cv.glmnet}).
#' @param onlyMain Logical. For competing risks, should we return absolute
#'   risks only for the main event of interest? Defaults to \code{TRUE}.
#' @param type Type of output. Can be \code{"CI"} (output is on the cumulative
#'   incidence scale) or \code{"survival"} (output is on the survival scale,
#'   i.e. 1 - CI).
#' @param addZero Logical. Should we add time = 0 at the beginning of the
#'   output? Defaults to \code{TRUE}.
#' @param ntimes Number of time points (only used if \code{time} is missing).
#' @param ... Extra parameters. Currently these are simply ignored.
#' @return If \code{time} was provided, returns the estimated absolute risk
#'   for the user-supplied covariate profiles. This will be stored in a matrix
#'   or a higher-dimensional array, depending on the input (see Details). If
#'   both \code{time} and \code{newdata} are missing, returns the original
#'   data with a new column containing the risk estimate at failure times.
#' @export
#' @importFrom stats formula delete.response setNames integrate runif median
#' @importFrom stats model.matrix update
#' @examples
#' # Simulate censored survival data for two outcome types
#' library(data.table)
#' set.seed(12345)
#' nobs <- 1000
#' tlim <- 20
#'
#' # simulation parameters
#' b1 <- 200
#' b2 <- 50
#'
#' # event type: 0 = censored, 1 = event of interest, 2 = competing event
#' # t: observed time/endpoint
#' # z: a binary covariate
#' DT <- data.table(z = rbinom(nobs, 1, 0.5))
#' DT[, `:=`("t_event" = rweibull(nobs, 1, b1),
#'           "t_comp" = rweibull(nobs, 1, b2))]
#' DT[, `:=`("event" = 1 * (t_event < t_comp) + 2 * (t_event >= t_comp),
#'           "time" = pmin(t_event, t_comp))]
#' DT[time >= tlim, `:=`("event" = 0, "time" = tlim)]
#'
#' out_linear <- fitSmoothHazard(event ~ time + z, DT, ratio = 10)
#'
#' linear_risk <- absoluteRisk(out_linear, time = 10,
#'                             newdata = data.table("z" = c(0, 1)))
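#'
#' # Further illustrations (a sketch following the documented interface):
#' # the "typical" covariate profile described in Details, and Monte Carlo
#' # integration, which can help when the fitted hazard is not smooth
#' \dontrun{
#' typical_risk <- absoluteRisk(out_linear, time = 10, newdata = "typical")
#' mc_risk <- absoluteRisk(out_linear, time = 10,
#'                         newdata = data.table("z" = c(0, 1)),
#'                         method = "montecarlo", nsamp = 1000)
#' }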
absoluteRisk <- function(object, time, newdata,
                         method = c("numerical", "montecarlo"),
                         nsamp = 100, s = c("lambda.1se", "lambda.min"),
                         n.trees, onlyMain = TRUE,
                         type = c("CI", "survival"),
                         addZero = TRUE, ntimes = 100, ...) {
    if (!inherits(object, c("glm", "cv.glmnet", "gbm", "CompRisk"))) {
        stop(paste("object is of class", class(object)[1],
                   "\nabsoluteRisk should be used with an object of class glm,",
                   "cv.glmnet, gbm, or CompRisk"),
             call. = TRUE)
    }
    if (inherits(object, "CompRisk")) {
        return(absoluteRisk.CompRisk(object, time, newdata, method,
                                     nsamp = nsamp, onlyMain = onlyMain,
                                     type = type, addZero = addZero))
    }
    # Parse arguments
    method <- match.arg(method)
    type <- match.arg(type)
    if (is.numeric(s)) {
        if (length(s) > 1) {
            warning(paste("More than one value for s has been",
                          "supplied. Only first entry will be used"))
        }
        s <- s[1]
    } else if (is.character(s)) {
        s <- match.arg(s)
    }
    if (inherits(object, "gbm")) {
        if (missing(n.trees)) stop("n.trees is missing")
    } else n.trees <- NULL
    if (!missing(newdata) && is.character(newdata) && newdata == "typical") {
        newdata <- if (is.null(object$matrix.fit)) {
            get_typical(object$originalData)
        } else apply(object$originalData$x, 2, median, na.rm = TRUE)
    }
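    # For matrix fits, the "typical" profile is the vector of column-wise
    # medians; for formula fits, get_typical() takes the median of numeric and
    # date variables and the most common level of factors (see Details).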
    # Call the correct function with correct parameters
    if (missing(newdata)) {
        if (missing(time)) {
            # If newdata and time are missing, compute risk for each subject
            # at their failure/censoring times
            return(estimate_risk(object, method, nsamp, s, n.trees,
                                 type = type, ...))
        } else {
            return(estimate_risk_newtime(object, time,
                                         method = method, nsamp = nsamp,
                                         s = s, n.trees = n.trees, type = type,
                                         addZero = addZero, ...))
        }
    } else {
        if (missing(time)) {
            # If only time is missing, compute risk for each row of newdata
            # at equidistant points
            max_time <- if (is.null(object$matrix.fit)) {
                max(object$originalData[object$timeVar][[1]])
            } else max(object$originalData$y[, object$timeVar])
            time <- seq(0, max_time, length.out = ntimes)
        }
        return(estimate_risk_newtime(object, time, newdata,
                                     method, nsamp, s, n.trees, type = type,
                                     addZero = addZero, ...))
    }
}
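
# A quick illustration of the dispatch above (a sketch; `fit` and `DT` stand
# in for a fitSmoothHazard output and its original data):
# absoluteRisk(fit)                      # risk for each subject at their own time
# absoluteRisk(fit, time = c(5, 10))     # absolute risk at the supplied times
# absoluteRisk(fit, newdata = DT[1:2, ]) # risk curves over ntimes equidistant points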
# Helper functions----
# estimate_risk computes one survival probability per covariate profile
# Currently, this is only called when we want the survival probabilities
# at failure times for the original data
estimate_risk <- function(object, method, nsamp, s, n.trees, type, ...) {
    newdata <- object$originalData
    if (inherits(newdata, "data.fit")) newdata <- newdata$x
    # Create risk variable and make sure it doesn't already exist
    riskVar <- ifelse(type == "CI", "risk", "survival")
    while (riskVar %in% names(newdata)) riskVar <- paste0(".", riskVar)
    # If both newdata and time are missing,
    # compute risk at failure times
    time_vector <- if (is.null(object$matrix.fit)) {
        newdata[object$timeVar][[1]]
    } else object$originalData$y[, object$timeVar]
    # Create a vectorised hazard function
    if (inherits(object, "cv.glmnet") && !is.null(object$matrix.fit)) {
        hazard <- function(x, fit, newdata, s, n.trees, ...) {
            # Note: the offset should be set to zero when estimating the hazard.
            newdata_matrix <- newdata[, colnames(newdata) != fit$timeVar,
                                      drop = FALSE]
            # newdata_matrix matches output from fitSmoothHazard.fit
            temp_matrix <- model.matrix(update(fit$formula_time, ~ . - 1),
                                        setNames(data.frame(x), fit$timeVar))
            newdata_matrix <- as.matrix(cbind(temp_matrix,
                                              as.data.frame(newdata_matrix)))
            pred <- estimate_hazard(fit, newdata_matrix, plot = FALSE,
                                    ci = FALSE, s = s, n.trees = n.trees, ...)
            return(as.numeric(exp(pred)))
        }
    } else {
        hazard <- function(x, fit, newdata, s, n.trees, ...) {
            # Note: the offset should be set to zero when estimating the hazard.
            newdata2 <- data.frame(newdata)
            newdata2 <- newdata2[rep(1, length(x)), ]
            newdata2[fit$timeVar] <- x
            pred <- estimate_hazard(fit, newdata2, plot = FALSE, ci = FALSE,
                                    s = s, n.trees = n.trees, ...)
            return(as.numeric(exp(pred)))
        }
    }
    # Compute cumulative hazard at each failure time
    risk_res <- sapply(seq_len(nrow(newdata)), function(j) {
        integrate(hazard, lower = 0, upper = time_vector[j],
                  fit = object, subdivisions = nsamp,
                  newdata = newdata[j, , drop = FALSE],
                  s = s, n.trees = n.trees)$value
    })
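    # With cumulative hazard H(t), survival is S(t) = exp(-H(t)) and the
    # cumulative incidence is CI(t) = 1 - exp(-H(t)); both scales appear below.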
    # Format the output depending on type
    if (is.data.frame(newdata)) {
        newdata[, riskVar] <- if (type == "CI") {
            1 - exp(-risk_res)
        } else exp(-risk_res)
    } else {
        newdata <- if (type == "CI") {
            cbind(1 - exp(-risk_res), newdata)
        } else cbind(exp(-risk_res), newdata)
        colnames(newdata)[1] <- riskVar
    }
    attr(newdata, "type") <- type
    return(newdata)
}
# estimate_risk_newtime computes the whole survival curve for each covariate
# profile
#' @importFrom data.table data.table :=
estimate_risk_newtime <- function(object, time, newdata, method, nsamp,
                                  s, n.trees, type, addZero, ...) {
    if (missing(newdata)) {
        # Should we use the whole case-base dataset or the original one?
        if (is.null(object$originalData)) {
            stop(paste("Cannot estimate the mean absolute risk without",
                       "the original data. See documentation."),
                 call. = FALSE)
        }
        newdata <- object$originalData
        unselectTime <- (names(newdata) != object$timeVar)
        newdata <- subset(newdata, select = unselectTime)
    }
    time_ordered <- unique(c(0, sort(time)))
    output <- matrix(NA, ncol = nrow(newdata) + 1, nrow = length(time_ordered))
    output[, 1] <- time_ordered
    colnames(output) <- c("time", rep("", nrow(newdata)))
    rownames(output) <- rep("", length(time_ordered))
    output[1, -1] <- 0
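    # Column 1 holds the sorted, deduplicated time points (with 0 prepended);
    # each remaining column will hold one risk curve per covariate profile,
    # starting at a risk of 0 at time 0.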
    if (length(time_ordered) > 1) {
        if (method == "numerical") {
            # Compute points at which we evaluate integral
            # Note: there's probably a more efficient way of choosing the knots
            knots <- seq(0, max(time_ordered),
                         length.out = (length(time_ordered) - 1) * nsamp)
            knots <- unique(sort(c(knots, time_ordered)))
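            # The knots form an equally spaced grid over [0, max(time)],
            # augmented with the requested time points themselves, so the
            # cumulative integral can be read off exactly at each of them.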
            for (j in seq_len(nrow(newdata))) {
                # Extract current obs
                current_obs <- newdata[j, , drop = FALSE]
                # Use trapezoidal rule for integration----
                if (inherits(newdata, "matrix")) {
                    newdata2 <- current_obs[,
                                            colnames(newdata) != object$timeVar,
                                            drop = FALSE]
                    # newdata2 matches the output from fitSmoothHazard.fit
                    temp_matrix <- model.matrix(update(object$formula_time,
                                                       ~ . - 1),
                                                setNames(data.frame(knots),
                                                         object$timeVar))
                    newdata2 <- as.matrix(cbind(temp_matrix,
                                                as.data.frame(newdata2)))
                } else {
                    # Create data.table for prediction
                    newdata2 <- data.table::data.table(current_obs)
                    newdata2 <- newdata2[rep(1, length(knots))]
                    newdata2[, object$timeVar := knots]
                }
                pred <- estimate_hazard(object = object, newdata = newdata2,
                                        plot = FALSE, ci = FALSE,
                                        s = s, n.trees = n.trees, ...)
                # Compute integral using trapezoidal rule
                # First remove infinite values (e.g. with log(t))
                pred_exp <- exp(pred)
                pred_exp[is.infinite(pred_exp)] <- 0
                which_knots <- knots %in% c(0, time_ordered)
                output[, j + 1] <- trap_int(knots, pred_exp)[which_knots]
            }
        }
if (method == "montecarlo") {
# Sample points at which we evaluate function
knots <- runif(n = length(time_ordered) * nsamp,
min = 0, max = max(time_ordered))
for (j in seq_len(nrow(newdata))) {
# Extract current obs
current_obs <- newdata[j, , drop = FALSE]
# Create data.table for prediction
newdata2 <- data.table(current_obs)
newdata2 <- newdata2[rep(1, length(knots))]
newdata2[, object$timeVar := knots]
pred <- estimate_hazard(object = object, newdata = newdata2,
plot = FALSE, ci = FALSE,
s = s, n.trees = n.trees, ...)
# Compute integral using MC integration
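                # Within each interval (t[k-1], t[k]], the mean of the sampled
                # hazard values times the interval width estimates the integral
                # of the hazard over that interval; the cumulative sum then
                # gives the cumulative hazard at each requested time point.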
                pred_exp <- exp(pred)
                pred_exp[is.infinite(pred_exp)] <- NA
                mean_values <- sapply(split(pred_exp,
                                            cut(knots, breaks = time_ordered)),
                                      mean, na.rm = TRUE)
                integral_estimates <- mean_values * diff(time_ordered)
                output[, j + 1] <- cumsum(c(0, integral_estimates))
            }
        }
        output[, -1] <- exp(-output[, -1])
        if (type == "CI") output[, -1] <- 1 - output[, -1]
    }
    # Sometimes Monte Carlo integration gives nonsensical probability estimates
    if (method == "montecarlo" && (any(output[, -1] < 0) ||
                                   any(output[, -1] > 1))) {
        warning(paste("Some probabilities are out of range. Consider",
                      "increasing nsamp or using numerical integration"),
                call. = FALSE)
    }
    # Add time = 0?
    if (!addZero) output <- output[-1, , drop = FALSE]
    # Add class
    class(output) <- c("absRiskCB", class(output))
    attr(output, "type") <- type
    return(output)
}
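
# trap_int() is the cumulative trapezoidal-rule helper used in
# estimate_risk_newtime(); it is defined elsewhere in the package. A minimal
# sketch of the behaviour assumed above (the name trap_int_sketch and this
# implementation are illustrative only):
trap_int_sketch <- function(x, y) {
    # Cumulative integral of y with respect to x via the trapezoidal rule:
    # the result has the same length as x and starts at 0, which matches how
    # trap_int()'s output is indexed by which_knots above
    c(0, cumsum(diff(x) * (y[-length(y)] + y[-1]) / 2))
}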