-
-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathfind_transformation.R
177 lines (165 loc) · 5.85 KB
/
find_transformation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#' @title Find possible transformation of model variables
#' @name find_transformation
#'
#' @description This functions checks whether any transformation, such as log-
#' or exp-transforming, was applied to the response variable (dependent
#' variable) in a regression formula. Optionally, all model terms can also be
#' checked for any such transformation. Currently, following patterns are
#' detected: `log`, `log1p`, `log2`, `log10`, `exp`, `expm1`, `sqrt`,
#' `log(y+<number>)`, `log-log`, `power` (e.g. to 2nd power, like `I(y^2)`),
#' `inverse` (like `1/y`), `scale` (e.g., `y/3`), and `box-cox` (e-g-,
#' `(y^lambda - 1) / lambda`).
#'
#' @param x A regression model or a character string of the formulation of the
#' (response) variable.
#' @param include_all Logical, if `TRUE`, does not only check the response
#' variable, but all model terms.
#' @param ... Currently not used.
#'
#' @return A string, with the name of the function of the applied transformation.
#' Returns `"identity"` for no transformation, and e.g. `"log(y+3)"` when
#' a specific values was added to the response variables before
#' log-transforming. For unknown transformations, returns `NULL`.
#'
#' @examples
#' # identity, no transformation
#' model <- lm(Sepal.Length ~ Species, data = iris)
#' find_transformation(model)
#'
#' # log-transformation
#' model <- lm(log(Sepal.Length) ~ Species, data = iris)
#' find_transformation(model)
#'
#' # log+2
#' model <- lm(log(Sepal.Length + 2) ~ Species, data = iris)
#' find_transformation(model)
#'
#' # find transformation for all model terms
#' model <- lm(mpg ~ log(wt) + I(gear^2) + exp(am), data = mtcars)
#' find_transformation(model, include_all = TRUE)
#'
#' # inverse, response provided as character string
#' find_transformation("1 / y")
#' @export
find_transformation <- function(x, ...) {
UseMethod("find_transformation")
}
#' @rdname find_transformation
#' @export
find_transformation.default <- function(x, include_all = FALSE, ...) {
# validation check
if (is.null(x) || is.data.frame(x) || !is_model(x)) {
return(NULL)
}
# sanity check for multivariate models
if (is_multivariate(x)) {
result <- lapply(find_terms(x, verbose = FALSE), function(i) {
find_transformation(i[["response"]])
})
unlist(result)
} else if (include_all) {
lapply(find_terms(x, verbose = FALSE), function(i) {
stats::setNames(
unlist(lapply(i, find_transformation), use.names = FALSE),
clean_names(i)
)
})
} else {
# "raw" response
rv <- find_terms(x, verbose = FALSE)[["response"]]
# for divisions, like x/3, `find_response()` returns a character vector
# of length 2, one with the nominator and the denominator. In this case,
# check against original response
original_response <- safe_deparse(find_formula(x, verbose = FALSE)$conditional[[2]])
# check if we have the pattern (x/<number)
if (.is_division(original_response)) {
# if so, check if the pattern really match
nominator <- gsub("/.*", "\\1", original_response)
denominator <- gsub(".*\\/(.*)", "\\1", original_response)
# and if so again, then reconstruct division string
if (all(rv == c(nominator, denominator))) {
rv <- paste(nominator, denominator, sep = "/") # nolint
}
}
find_transformation(rv)
}
}
#' @export
find_transformation.character <- function(x, ...) {
transform_fun <- "identity"
# remove whitespaces
x <- gsub(" ", "", x, fixed = TRUE)
# log-transformation
if (any(grepl("log\\((.*)\\)", x))) {
# do we have log-log models?
if (grepl("log\\(log\\((.*)\\)\\)", x)) {
transform_fun <- "log-log"
} else {
plus_minus <- NULL
# make sure we definitly have a "+" in the log-transformation
if (grepl("+", x, fixed = TRUE)) {
# 1. try: log(x + number)
plus_minus <- .safe(
eval(parse(text = gsub("log\\(([^,\\+)]*)(.*)\\)", "\\2", x)))
)
# 2. try: log(number + x)
if (is.null(plus_minus)) {
plus_minus <- .safe(
eval(parse(text = gsub("log\\(([^,\\+)]*)(.*)\\)", "\\1", x)))
)
}
}
if (is.null(plus_minus) || is.function(plus_minus)) {
transform_fun <- "log"
} else {
transform_fun <- paste0("log(x+", plus_minus, ")")
}
}
} else if (any(grepl("log1p\\((.*)\\)", x))) {
# log1p-transformation
transform_fun <- "log1p"
} else if (any(grepl("expm1\\((.*)\\)", x))) {
# expm1-transformation
transform_fun <- "expm1"
} else if (any(grepl("log2\\((.*)\\)", x))) {
# log2/log10-transformation
transform_fun <- "log2"
} else if (any(grepl("log10\\((.*)\\)", x))) {
transform_fun <- "log10"
} else if (any(grepl("exp\\((.*)\\)", x))) {
# exp-transformation
transform_fun <- "exp"
} else if (any(grepl("sqrt\\((.*)\\)", x))) {
# sqrt-transformation
plus_minus <- eval(parse(text = gsub("sqrt\\(([^,\\+)]*)(.*)\\)", "\\2", x)))
if (is.null(plus_minus) || is.function(plus_minus)) {
transform_fun <- "sqrt"
} else {
transform_fun <- paste0("sqrt(x+", plus_minus, ")")
}
} else if (any(startsWith(x, "1/"))) {
# inverse-transformation
transform_fun <- "inverse"
} else if (.is_division(x)) {
# scale or Box-Cox transformation
if (.is_box_cox(x)) {
transform_fun <- "box-cox"
} else {
transform_fun <- "scale"
}
} else if (any(grepl("(.*)(\\^|\\*\\*)\\s?-?(\\d+|[()])", x))) {
# power-transformation
transform_fun <- "power"
} else if (any(grepl("I\\((.*)\\)", x))) {
# (unknown) I-transformation
transform_fun <- NULL
}
transform_fun
}
# helper -----------------------------
.is_division <- function(x) {
any(grepl("(.*)/([0-9\\.\\+\\-]+)(\\)*)$", x)) && !any(grepl("(.*)(\\^|\\*\\*)\\((.*)/(.*)\\)", x))
}
.is_box_cox <- function(x) {
any(grepl("\\((.*)\\^[0-9\\.\\+\\-]+-1\\)", x))
}