diff --git a/DESCRIPTION b/DESCRIPTION index 3fc895881..bc8c7aefb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -98,6 +98,8 @@ Roxygen: list(markdown = TRUE, r6 = FALSE) RoxygenNote: 7.3.2 VignetteBuilder: knitr Collate: + 'DataBackendJoin.R' + 'DataBackendMultiCbind.R' 'Graph.R' 'GraphLearner.R' 'mlr_pipeops.R' diff --git a/NAMESPACE b/NAMESPACE index 6d8c22381..0aa60b089 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,8 @@ S3method(unmarshal_model,pipeop_impute_learner_state_marshaled) S3method(unmarshal_model,pipeop_learner_cv_state_marshaled) export("%>>!%") export("%>>%") +export(DataBackendJoin) +export(DataBackendMultiCbind) export(Graph) export(GraphLearner) export(LearnerClassifAvg) diff --git a/R/DataBackendJoin.R b/R/DataBackendJoin.R new file mode 100644 index 000000000..208a1f0dc --- /dev/null +++ b/R/DataBackendJoin.R @@ -0,0 +1,160 @@ + + +#' @export +DataBackendJoin = R6Class("DataBackendJoin", inherit = DataBackend, cloneable = FALSE, + public = list( + initialize = function(b1, b2, type, by_b1 = NULL, by_b2 = NULL, b1_index_colname = NULL, b2_index_colname = NULL) { + assert_backend(b1) + assert_backend(b2) + + if ("data.table" %nin% intersect(b1$data_formats, b2$data_formats)) { + stop("DataBackendJoin currently only supports DataBackends that support 'data.table' format.") + } + + assert_choice(type, c("left", "right", "outer", "inner")) + + colnames_b1 = b1$colnames + colnames_b2 = b2$colnames + allcolnames = union(colnames_b1, colnames_b2) + + assert_choice(by_b1, colnames_b1, null.ok = TRUE) + assert_choice(by_b2, colnames_b2, null.ok = TRUE) + + assert_string(b1_index_colname, null.ok = TRUE) + assert_string(b2_index_colname, null.ok = TRUE) + + if (!is.null(b1_index_colname) && b1_index_colname %in% setdiff(allcolnames, b1$primary_key)) stopf("b1_index_colname '%s' already a non-primary-key column in b1 or b2.", b1_index_colname) + if (!is.null(b2_index_colname) && b2_index_colname %in% setdiff(allcolnames, b2$primary_key)) stopf("b2_index_colname '%s' already a non-primary-key column in b2 or b2.", b2_index_colname) + if (!is.null(b1_index_colname) && !is.null(b2_index_colname) && b1_index_colname == b2_index_colname) stop("b1_index_colname and b2_index_colname must be different, but are both '%s'.", b1_index_colname) + + colnames = unique(c(allcolnames, b1_index_colname, b2_index_colname)) + + rownames_b1 = b1$rownames + rownames_b2 = b2$rownames + + joinby_b1 = if (is.null(by_b1)) rownames_b1 else b1$data(rownames_b1, by_b1, data_format = "data.table")[[1]] + joinby_b2 = if (is.null(by_b2)) rownames_b2 else b2$data(rownames_b2, by_b2, data_format = "data.table")[[1]] + + index_table = merge(data.table(rownames_b1, joinby_b1), data.table(rownames_b2, joinby_b2), by.x = "joinby_b1", by.y = "joinby_b2", + all.x = type %in% c("left", "outer"), all.y = type %in% c("right", "outer"), sort = FALSE, allow.cartesian = TRUE) + + set(index_table, , "joinby_b1", NULL) + + pk = "..row_id" + index = 0 + while (pk %in% allcolnames) { + index = index + 1 + pk = paste0("..row_id.", index) + } + + super$initialize(list( + b1 = b1, b2 = b2, + colnames_b1 = setdiff(colnames_b1, colnames_b2), + allcolnames = unique(c(colnames_b1, colnames_b2, b1_index_colname, b2_index_colname, pk)), + index_table = index_table, + b1_index_colname = b1_index_colname, + b2_index_colname = b2_index_colname, + pk = pk, + aux_hash = calculate_hash(by_b1, by_b2, type, b1_index_colname, b2_index_colname) + ), primary_key = pk, data_formats = "data.table") + }, + + data = function(rows, cols, data_format = "data.table") { + d = private$.data + rows = rows[inrange(rows, 1, nrow(d$index_table))] + indices = d$index_table[rows] + b1_rows = indices[!is.na(rownames_b1), rownames_b1] + b2_rows = indices[!is.na(rownames_b2), rownames_b2] + indices[!is.na(rownames_b1), b1_index := seq_len(length(b1_rows))] + indices[!is.na(rownames_b2), b2_index := seq_len(length(b2_rows))] + b1_index = indices[, b1_index] + b2_index = indices[, b2_index] + + data = d$b2$data(b2_rows, cols, data_format = "data.table")[b2_index] + remainingcols = intersect(cols, d$colnames_b1) + if (length(remainingcols)) { + data = cbind(data, d$b1$data(b1_rows, cols, data_format = "data.table")[b1_index]) + } + setkeyv(data, NULL) + if (d$pk %in% cols) { + set(data, , d$pk, rows) + } + if (!is.null(d$b2_index_colname) && d$b2_index_colname %in% cols) { + rownames_b2 = indices$rownames_b2 + set(data, , d$b2_index_colname, rownames_b2) + } + if (!is.null(d$b1_index_colname) && d$b1_index_colname %in% cols) { + rownames_b1 = indices$rownames_b1 + set(data, ,d$b1_index_colname, rownames_b1) + } + data[, intersect(cols, names(data)), with = FALSE] + }, + + head = function(n = 6L) { + rows = first(self$rownames, n) + self$data(rows = rows, cols = self$colnames) + }, + distinct = function(rows, cols, na_rm = TRUE) { + d = private$.data + indices = d$index_table[rows] + rownames_b1 = rownames_b2 = NULL + b1_rows = indices[!is.na(rownames_b1), rownames_b1] + b2_rows = indices[!is.na(rownames_b2), rownames_b2] + d2 = private$.data$b2$distinct(rows = b2_rows, cols = cols, na_rm = na_rm) + if (!is.null(d$b2_index_colname) && d$b2_index_colname %in% cols) { + d2[[d$b2_index_colname]] = if (na_rm) unique(b2_rows) else unique(indices$rownames_b2) + } + d1 = private$.data$b1$distinct(rows = b1_rows, cols = setdiff(cols, names(d2)), na_rm = na_rm) + if (!is.null(d$b1_index_colname) && d$b1_index_colname %in% cols) { + d1[[d$b1_index_colname]] = if (na_rm) unique(b1_rows) else unique(indices$rownames_b1) + } + + if (!na_rm && length(b1_rows) < length(rows)) { + d1 = map(d1, function(x) if (any(is.na(x))) x else c(x, NA)) + } + if (!na_rm && length(b2_rows) < length(rows)) { + d2 = map(d2, function(x) if (any(is.na(x))) x else c(x, NA)) + } + res = c(d1, d2) + if (d$pk %in% cols) { + res[[d$pk]] = unique(rows) + } + + res[match(cols, names(res), nomatch = 0)] + }, + missings = function(rows, cols) { + d = private$.data + indices = d$index_table[rows] + rownames_b1 = rownames_b2 = NULL + b1_rows = indices[!is.na(rownames_b1), rownames_b1] + b2_rows = indices[!is.na(rownames_b2), rownames_b2] + m2 = private$.data$b2$missings(b2_rows, cols) + if (!is.null(d$b2_index_colname) && d$b2_index_colname %in% cols) { + m2[d$b2_index_colname] = 0L + } + m1 = private$.data$b1$missings(b1_rows, setdiff(cols, names(m2))) + if (!is.null(d$b1_index_colname) && d$b1_index_colname %in% cols) { + m1[d$b1_index_colname] = 0L + } + m1 = m1 + length(rows) - length(b1_rows) + m2 = m2 + length(rows) - length(b2_rows) + res = c(m1, m2) + if (d$pk %in% cols) { + res[d$pk] = 0L + } + res[match(cols, names(res), nomatch = 0)] + } + ), + active = list( + rownames = function() seq_len(nrow(private$.data$index_table)), + colnames = function() private$.data$allcolnames, + nrow = function() nrow(private$.data$index_table), + ncol = function() length(private$.data$allcolnames) + ), + private = list( + .calculate_hash = function() { + d = private$.data + calculate_hash(d$b1$hash, d$b2$hash,d$aux_hash) + } + ) +) diff --git a/R/DataBackendMultiCbind.R b/R/DataBackendMultiCbind.R new file mode 100644 index 000000000..d62136a69 --- /dev/null +++ b/R/DataBackendMultiCbind.R @@ -0,0 +1,134 @@ + + +#' @export +DataBackendMultiCbind = R6Class("DataBackendMultiCbind", inherit = DataBackend, cloneable = FALSE, + public = list( + initialize = function(bs) { + assert_list(bs, min.len = 1) + lapply(bs, assert_backend) + + formats = Reduce(intersect, map(bs, "data_formats")) + + private$.colnames = unique(unlist(map(bs, "colnames"))) + + # primary key: if all backends have the same pk, just use that one. + otherpk = unique(unlist(map(bs, "primary_key"))) + if (length(otherpk) == 1) { + pk = otherpk + } else { + # otherwise: introduce a new primary key that is completely different from the previous ones. + pk = "..row_id" + index = 0 + while (pk %in% private$.colnames) { + index = index + 1 + pk = paste0("..row_id.", index) + } + private$.colnames = c(private$.colnames, pk) + } + + super$initialize(list(bs = rev(bs)), pk, formats) + }, + data = function(rows, cols, data_format = "data.table") { + bs = private$.data$bs + + urows = unique(rows) + + datas = list() + pks = character(length(bs)) + include_pk = logical(length(bs)) + cols_remaining = cols + allrows = list() + for (i in seq_along(bs)) { + ## Not doing 'if (length(cols_remaining)) break' because there could still be tables remaining that add rows + pk = bs[[i]]$primary_key + pks[[i]] = pk + include_pk[[i]] = pk %in% cols_remaining + if (include_pk[[i]]) { + datas[[i]] = bs[[i]]$data(urows, cols_remaining, data_format = data_format) + cols_remaining = setdiff(cols_remaining, colnames(datas[[i]])) + } else { + datas[[i]] = bs[[i]]$data(urows, c(pk, cols_remaining), data_format = data_format) + cols_remaining = setdiff(cols_remaining, colnames(datas[[i]])[-1]) + } + allrows[[i]] = datas[[i]][[pk]] + } + presentrows = unique(unlist(allrows)) + join = list(presentrows) + result = do.call(cbind, pmap(list(datas, pks, include_pk), function(data, pk, include) { + if (include) { + result = data[join, on = pk, nomatch = NA] + set(result, result[[pk]] %nin% data[[pk]], pk, NA) + } else { + data[join, -pk, on = pk, with = FALSE, nomatch = NA] + } + })) + sbk = self$primary_key + + set(result, , sbk, presentrows) + join = list(rows) + result[join, intersect(cols, colnames(result)), with = FALSE, on = sbk, nomatch = NULL] + }, + head = function(n = 6L) { + rows = head(self$rownames, n) + self$data(rows = rows, cols = self$colnames) + }, + distinct = function(rows, cols, na_rm = TRUE) { + bs = private$.data$bs + getpk = self$primary_key %in% cols + reslist = list() + remaining_cols = cols + if (!na_rm || getpk) { + rows = intersect(rows, self$rownames) + } + for (i in seq_along(bs)) { + if (!length(remaining_cols)) break + reslist[[i]] = bs[[i]]$distinct(rows = rows, cols = cols, na_rm = na_rm) + remaining_cols = setdiff(remaining_cols, names(reslist[[i]])) + if (!na_rm && !all(rows %in% bs[[i]]$rownames)) { + reslist[[i]] = map(reslist[[i]], function(x) if (any(is.na(x))) x else c(x, NA)) + } + } + result = unlist(reslist, recursive = FALSE) + if (getpk) { + result[[self$primary_key]] = rows + } + result[match(cols, names(result), nomatch = 0)] + }, + missings = function(rows, cols) { + rows = rows[rows %in% self$rownames] + bs = private$.data$bs + getpk = self$primary_key %in% cols + reslist = list() + remaining_cols = cols + for (i in seq_along(bs)) { + if (!length(remaining_cols)) break + missingrows = sum(rows %nin% bs[[i]]$rownames) + reslist[[i]] = bs[[i]]$missings(rows, remaining_cols) + missingrows + remaining_cols = setdiff(remaining_cols, names(reslist[[i]])) + } + result = unlist(reslist) + if (self$primary_key %in% cols) { + result[[self$primary_key]] = 0L + } + result[match(cols, names(result), nomatch = 0)] + } + ), + active = list( + rownames = function() { + if (is.null(private$.rownames_cache)) private$.rownames_cache = unique(unlist(rev(map(private$.data$bs, "rownames")))) + private$.rownames_cache + }, + colnames = function() { + private$.colnames + }, + nrow = function() length(self$rownames), + ncol = function() length(self$colnames) + ), + private = list( + .rownames_cache = NULL, + .colnames = NULL, + .calculate_hash = function() { + do.call(calculate_hash, private$.data$bs) + } + ) +) diff --git a/R/PipeOpFeatureUnion.R b/R/PipeOpFeatureUnion.R index 61327c75b..95a03f14b 100644 --- a/R/PipeOpFeatureUnion.R +++ b/R/PipeOpFeatureUnion.R @@ -12,8 +12,11 @@ #' across all [`Task`][mlr3::Task]s. Only the target column(s) of the first [`Task`][mlr3::Task] #' are kept. #' -#' If `assert_targets_equal` is `TRUE` then target column names are compared and an error is thrown -#' if they differ across inputs. +#' `PipeOpFeatureUnion` tries to merge columns that are identical, while preventing accidental +#' overwrites of columns that contain differing data. This is controlled using the `feature_clash` +#' (for columns containing features, weights etc.) and `target_clash` (for tharget columns) +#' hyperparameters. The `assert_target_equal` construction parameter / field can still be used +#' as well but is deprecated and will generate a warning. #' #' If input tasks share some feature names but these features are not identical an error is thrown. #' This check is performed by first comparing the features names and if duplicates are found, also @@ -41,6 +44,7 @@ #' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise #' be set during construction. Default `list()`. #' * `assert_targets_equal` :: `logical(1)`\cr +#' DEPRECATED; use `target_clash` hyperparameter instead.\cr #' If `assert_targets_equal` is `TRUE` (Default), task target column names are checked for #' agreement. Disagreeing target column names are usually a bug, so this should often be left at #' the default. @@ -61,7 +65,33 @@ #' The `$state` is left empty (`list()`). #' #' @section Parameters: -#' [`PipeOpFeatureUnion`] has no Parameters. +#' * `target_clash` :: `character(1)`\cr +#' How to handle target columns that differ between input [`Task`][mlr3::Task]s. `"allow_same_hash"` +#' checks the names and `$col_hashes` and throws an error if they disagree. `"allow_same_content"` (default) is +#' more permissive: If `$col_hashes` disagree, then it checks the target content, if the content of both +#' columns agree, then merging of tasks is still allowed. This avoids some rare false-positives, but in cases +#' where hashes *do* disagree this may be slow for [`Task`][mlr3::Task]s with many rows or targets. +#' `"ignore"` does not check for target agreement and overwrites the target with the target of the *rightmost* / +#' highest numbered input [`Task`][mlr3::Task]. Use with caution. This is the only option that allows feature-union of [`Task`][mlr3::Task]s +#' that differ in the names of their target column (and all target columns except the rightmost / highest numbered input +#' [`Task`][mlr3::Task]'s target are dropped in that case).\cr +#' The deprecated field `assert_targets_equal` sets this value to `"allow_same_content"` (i.e. default) when `TRUE` and to +#' `"ignore"` when `FALSE`. +#' * `feature_clash` :: `character(1)`\cr +#' How to handle non-target columns that have the same name but differ between input [`Task`][mlr3::Task]s. `"allow_same_hash"` +#' checks the names and `$col_hashes` and throws an error if they disagree. `"allow_same_content"` (default) is +#' more permissive: If `$col_hashes` disagree, then it checks the column content, if the content of both +#' columns agree, then merging of tasks is still allowed. This avoids some rare false-positives, but in cases +#' where hashes *do* disagree this may be slow for large [`Task`][mlr3::Task]s. +#' `"ignore"` does not check for column data agreement and overwrites columns of the same name with the values of the *rightmost* / +#' highest numbered input [`Task`][mlr3::Task].\cr +#' Some column roles (`"group"`, `"weight"`, `"name"`) do not allow more than one column role present in a [`Task`][mlr3::Task] (see +#' `$col_roles` documentation there). When up to one [`Task`][mlr3::Task] has a column of these column role, it is taken for the +#' resulting [`Task`][mlr3::Task] without any issue. When more than one [`Task`][mlr3::Task] has a column with one of these roles, +#' but with the same name, the `feature_clash` policy applies as described above. When more than one [`Task`][mlr3::Task] has a +#' column with one of these roles, but they have *different* names, then an error is thrown when `feature_clash` is not `"ignore"`. +#' When it is `"ignore"`, the *rightmost* / highest numbered input [`Task`][mlr3::Task]'s column is used and all others of this +#' role are discarded. #' #' @section Internals: #' [`PipeOpFeatureUnion`] uses the [`Task`][mlr3::Task] `$cbind()` method to bind the input values @@ -99,21 +129,26 @@ PipeOpFeatureUnion = R6Class("PipeOpFeatureUnion", inherit = PipeOp, public = list( - assert_targets_equal = NULL, + inprefix = NULL, initialize = function(innum = 0L, collect_multiplicity = FALSE, id = "featureunion", param_vals = list(), assert_targets_equal = TRUE) { assert( check_int(innum, lower = 0L), check_character(innum, min.len = 1L, any.missing = FALSE) ) + params = ps( + target_clash = p_fct(c("allow_same_hash", "allow_same_content", "ignore")), + feature_clash = p_fct(c("forbid", "allow_same_hash", "allow_same_content", "ignore")) + ) + params$values = list(target_clash = "allow_same_content", feature_clash = "allow_same_content") + if (is.numeric(innum)) { self$inprefix = rep("", innum) } else { self$inprefix = innum innum = length(innum) } - assert_flag(assert_targets_equal) - self$assert_targets_equal = assert_targets_equal + inname = if (innum) rep_suffix("input", innum) else "..." intype = "Task" private$.collect = assert_flag(collect_multiplicity) @@ -129,9 +164,25 @@ PipeOpFeatureUnion = R6Class("PipeOpFeatureUnion", output = data.table(name = "output", train = "Task", predict = "Task"), tags = "ensemble" ) + + # the following is DEPRECATED + if (!missing(assert_targets_equal)) { + # do this after init so the AB can modify self$param_set + assert_flag(assert_targets_equal) + self$assert_targets_equal = assert_targets_equal + } + } + ), + active = list( + assert_targets_equal = function(rhs) { + if (!missing(rhs)) private$.assert_targets_equal = rhs + self$param_set$values$target_clash = if (private$.assert_targets_equal) "allow_same_content" else "ignore" + warning("PipeOpFeatureUnion assert_targets_equal is deprecated. Use the 'target_clash' hyperparameter.") + private$.assert_targets_equal } ), private = list( + .assert_targets_equal = NULL, .train = function(inputs) { self$state = list() if (private$.collect) inputs = unclass(inputs[[1]]) diff --git a/R/PipeOpScaleMaxAbs.R b/R/PipeOpScaleMaxAbs.R index 1dae80bfd..20d36c36c 100644 --- a/R/PipeOpScaleMaxAbs.R +++ b/R/PipeOpScaleMaxAbs.R @@ -64,7 +64,7 @@ PipeOpScaleMaxAbs = R6Class("PipeOpScaleMaxAbs", private = list( .get_state_dt = function(dt, levels, target) { - lapply(dt, function(x){ + lapply(dt, function(x) { s = max(abs(range(x, na.rm = TRUE, finite = TRUE))) if (s == 0) { s = 1 diff --git a/attic/pofu.md b/attic/pofu.md new file mode 100644 index 000000000..bcc1a6c35 --- /dev/null +++ b/attic/pofu.md @@ -0,0 +1,203 @@ +# POFU design document + +## Issues + +### 126: POFU could drop all targets except first task's one + +Instead of checking that all targets are the same and throwing an error if they are not. + +### 216: POFU with differing row IDs + +PipeOpFeatureUnion could under some circumstances want to unite tasks that have differing row IDs, e.g. after PipeOpSubsample on two different paths sampled (and `$filter()`ed) different sets of rows. + +```r +graph = greplicate(PipeOpSubsample$new() %>>% + PipeOpLearnerCV$new("classif.rpart"), 2) %>>% + PipeOpFeatureUnion$new() +graph$plot() # this is what it looks like + +graph$train("iris") # assertion error +``` +mlr-org/mlr3#309 could solve part of this, but the problem goes deeper: +* what if we do sampling with replacement? +* what if PipeOpLearnerCV has a resampling that predicts some entries multiple times, e.g. RepCV or bootstrapping? + +### 271: Use DataBackend info to avoid unnecessary data comparison + +Using col_hashes + +### 388: POFU should use DataBackend cbind + +but backends do not have info about col roles + +### 390: POFU assert_targets_equal parameter should go + +### 570: assertion on 'rows' failed + +branch with subsample on one end gives error + +```r +library(mlr3) +library(mlr3pipelines) + +task = tsk("iris") +resampling = rsmp("holdout") + +graph = gunion( + list( + po("pca") %>>% po("learner_cv", id = "featureless", lrn("classif.featureless")), + po("subsample") %>>% po("learner_cv", id = "rpart", lrn("classif.rpart"))) + ) %>>% + po("featureunion") %>>% + po("learner", lrn("classif.rpart")) + +resample(task, graph, resampling) + +``` + +- possibly improve error message +- what are the options here? fill with NAs? aggregate? + +### 571: POFU seems more broken now + +this does not seem to give an error any more: + +```r + pos = PipeOpSubsample$new() + pos$param_set$values$frac = 0.5 + g = pipeline_greplicate( + pos %>>% PipeOpPCA$new(), + 2 + ) %>>% PipeOpFeatureUnion$new(c("a", "b")) + task = mlr_tasks$get("iris") + expect_error(g$train(task), "Assertion on 'rows'") +``` + + +### 607: PipeOpPredictionUnion + +https://github.com/mlr-org/miesmuschel/blob/smashy_ex/R/PipeOpPredictionUnion.R + +name is confusing, since it is rbinding, not cbinding + +### 634: New Pipeop: Split data by row_ids / logical arg + +https://gist.github.com/pfistfl/6b190f0612535817bdd33fe8f8bd6548 + +- how do we combine this with zero-inflated things? + +### 646: Bootstrap resampling + +Apparently the problem is that bootstrapping uses some rows repeatedly, which somehow breaks with mlr3's assumption that row_ids are unique values. + +```r +library("mlr3") +library("mlr3pipelines") +options(mlr3.debug=TRUE) +resample(tsk("iris"), po("pca") %>>% lrn("classif.featureless"), rsmp("bootstrap")) +``` + + +### 696: PipeOpFeatureUnion breaks predict_newdata when all features of original task aver overwritten + + +```r +gr <- list(po("select", selector = selector_none()), po("nop")) %>>!% po("featureunion", innum = c("a", "")) %>>!% + { l <- lrn("classif.rpart") ; l$properties <- setdiff(l$properties, "missings") ; l } +gr$train(tsk("iris")) +#> $classif.rpart.output +#> NULL +#> +gr$predict(tsk("iris")) +#> $classif.rpart.output +#> for 150 observations: +#> row_ids truth response +#> 1 setosa setosa +#> 2 setosa setosa +#> 3 setosa setosa +#> --- +#> 148 virginica virginica +#> 149 virginica virginica +#> 150 virginica virginica + +lr <- as_learner(gr) +lr$train(tsk("iris")) +lr$predict_newdata(iris[1:4]) +#> Error in map_values(names(x), self$old, self$new) : +#> Assertion on 'x' failed: Must be of type 'atomic vector', not 'NULL'. +#> This happened PipeOp classif.rpart's $predict() +``` + +### 697: POFU should use feature_types as reported by input tasks and not the datatypes it gets from $data() (mlr-org/mlr3#685). + +- how does data conversion work between task and backend? + +### cbind backend simplification + +## Notes + +- task filter: integer ids as reported by backend +- backend$data +- duplicated IDs, possibly problem with resampling +- col conversion + - "conversion should hapen" + - setequal factorlevel: convert, otherwise kA + - maybe happens when there are fewer levels than before (? -- check) + - predict-newdata + +- databackendrename +- backends are read-only, but want to be able to copy / extend +- do we want one multicbind, one join? +- how does this id-stuff work again? + +## Synthesis + +### Mostly independent of data backend + +- Handling Tasks with different Targets (126) +- Data Comparison when merging (271) +- Error not triggered (571) +- predict_newdata issues (696: all cols replaced, 697: feature types) + +### POFU behaviour + +- POFU should use DataBackend cbind (388) +- POFU assert_targets_equal parameter should go (390) + +### "merging" with different row IDs + +- 216, 570, 646 +- what if multiple rows are generated? + - fill with NAs? + - aggregate? + - might be relevant for PipeOpLearenerCV with RepCV or bootstrapping +- left, right, outer, inner join? +- train() vs predict() +- predictions + - see how missing predictions / NAs are handled +- how does it cope with prediction IDs being wrong? + +### Predictions + +- PipeOpFeatureUnion (607) + +### Splitting + +- by row_ids / logical arg (634) +- zero inflated zeug + +### What else? + +- predict cols that are then used as input +- auto-simplification + +## Use Cases + + - operation performed on subset of rows, e.g. subset >> op() | otherop() + - join means NAs are introduced + - learner_cv makes prediction only for some inputs, or makes multiple predictions + - join means NAs on missing predictions? + - join aggregation of (learner_cv) predictions? + - rows with missing predictions are dropped? + - in all of the above: predict just cbinds + - diff --git a/man/grapes-greater-than-greater-than-grapes.Rd b/man/grapes-greater-than-greater-than-grapes.Rd index e49cdf9a2..d3adb61d0 100644 --- a/man/grapes-greater-than-greater-than-grapes.Rd +++ b/man/grapes-greater-than-greater-than-grapes.Rd @@ -20,7 +20,7 @@ g1 \%>>!\% g2 \code{\link{Graph}} / \code{\link{PipeOp}} / object-convertible-to-\code{\link{PipeOp}} to put after \code{g1}.} \item{in_place}{(\code{logical(1)})\cr -Whether to try to avoid cloning \code{g1}. If \code{g1} is not a \code{\link{Graph}}, then it is cloned regardless.} +Whether to try to avoid cloning \code{g1}. If \code{g1} is not a \code{\link{Graph}}, then it is cloned regardless.\n} } \value{ \code{\link{Graph}}: the constructed \code{\link{Graph}}. diff --git a/man/mlr_pipeops_featureunion.Rd b/man/mlr_pipeops_featureunion.Rd index a509b87eb..a88f6cdee 100644 --- a/man/mlr_pipeops_featureunion.Rd +++ b/man/mlr_pipeops_featureunion.Rd @@ -15,8 +15,11 @@ Aggregates features from all input tasks by \code{\link[=cbind]{cbind()}}ing the across all \code{\link[mlr3:Task]{Task}}s. Only the target column(s) of the first \code{\link[mlr3:Task]{Task}} are kept. -If \code{assert_targets_equal} is \code{TRUE} then target column names are compared and an error is thrown -if they differ across inputs. +\code{PipeOpFeatureUnion} tries to merge columns that are identical, while preventing accidental +overwrites of columns that contain differing data. This is controlled using the \code{feature_clash} +(for columns containing features, weights etc.) and \code{target_clash} (for tharget columns) +hyperparameters. The \code{assert_target_equal} construction parameter / field can still be used +as well but is deprecated and will generate a warning. If input tasks share some feature names but these features are not identical an error is thrown. This check is performed by first comparing the features names and if duplicates are found, also @@ -45,6 +48,7 @@ Identifier of the resulting object, default \code{"featureunion"}. List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. \item \code{assert_targets_equal} :: \code{logical(1)}\cr +DEPRECATED; use \code{target_clash} hyperparameter instead.\cr If \code{assert_targets_equal} is \code{TRUE} (Default), task target column names are checked for agreement. Disagreeing target column names are usually a bug, so this should often be left at the default. @@ -72,7 +76,35 @@ The \verb{$state} is left empty (\code{list()}). \section{Parameters}{ -\code{\link{PipeOpFeatureUnion}} has no Parameters. +\itemize{ +\item \code{target_clash} :: \code{character(1)}\cr +How to handle target columns that differ between input \code{\link[mlr3:Task]{Task}}s. \code{"allow_same_hash"} +checks the names and \verb{$col_hashes} and throws an error if they disagree. \code{"allow_same_content"} (default) is +more permissive: If \verb{$col_hashes} disagree, then it checks the target content, if the content of both +columns agree, then merging of tasks is still allowed. This avoids some rare false-positives, but in cases +where hashes \emph{do} disagree this may be slow for \code{\link[mlr3:Task]{Task}}s with many rows or targets. +\code{"ignore"} does not check for target agreement and overwrites the target with the target of the \emph{rightmost} / +highest numbered input \code{\link[mlr3:Task]{Task}}. Use with caution. This is the only option that allows feature-union of \code{\link[mlr3:Task]{Task}}s +that differ in the names of their target column (and all target columns except the rightmost / highest numbered input +\code{\link[mlr3:Task]{Task}}'s target are dropped in that case).\cr +The deprecated field \code{assert_targets_equal} sets this value to \code{"allow_same_content"} (i.e. default) when \code{TRUE} and to +\code{"ignore"} when \code{FALSE}. +\item \code{feature_clash} :: \code{character(1)}\cr +How to handle non-target columns that have the same name but differ between input \code{\link[mlr3:Task]{Task}}s. \code{"allow_same_hash"} +checks the names and \verb{$col_hashes} and throws an error if they disagree. \code{"allow_same_content"} (default) is +more permissive: If \verb{$col_hashes} disagree, then it checks the column content, if the content of both +columns agree, then merging of tasks is still allowed. This avoids some rare false-positives, but in cases +where hashes \emph{do} disagree this may be slow for large \code{\link[mlr3:Task]{Task}}s. +\code{"ignore"} does not check for column data agreement and overwrites columns of the same name with the values of the \emph{rightmost} / +highest numbered input \code{\link[mlr3:Task]{Task}}.\cr +Some column roles (\code{"group"}, \code{"weight"}, \code{"name"}) do not allow more than one column role present in a \code{\link[mlr3:Task]{Task}} (see +\verb{$col_roles} documentation there). When up to one \code{\link[mlr3:Task]{Task}} has a column of these column role, it is taken for the +resulting \code{\link[mlr3:Task]{Task}} without any issue. When more than one \code{\link[mlr3:Task]{Task}} has a column with one of these roles, +but with the same name, the \code{feature_clash} policy applies as described above. When more than one \code{\link[mlr3:Task]{Task}} has a +column with one of these roles, but they have \emph{different} names, then an error is thrown when \code{feature_clash} is not \code{"ignore"}. +When it is \code{"ignore"}, the \emph{rightmost} / highest numbered input \code{\link[mlr3:Task]{Task}}'s column is used and all others of this +role are discarded. +} } \section{Internals}{ diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 7c8c351df..979dca4a4 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -96,7 +96,7 @@ See \code{\link[NMF:nmf]{nmf()}}. \section{Internals}{ -Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis]{basis()}}, \code{\link[NMF:coef]{coef()}} and +Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis-coef-methods]{basis()}}, \code{\link[NMF:basis-coef-methods]{coef()}} and \code{\link[MASS:ginv]{ginv()}}. } diff --git a/man/mlr_pipeops_rowapply.Rd b/man/mlr_pipeops_rowapply.Rd index 85e0ac30e..cc15306ab 100644 --- a/man/mlr_pipeops_rowapply.Rd +++ b/man/mlr_pipeops_rowapply.Rd @@ -46,14 +46,13 @@ Function to apply to each row in the affected columns of the task. The return value should be a vector of the same length for every input. Initialized as \code{\link[base:identity]{identity()}}. \item \code{col_prefix} :: \code{character(1)}\cr -If specified, prefix to be prepended to the column names of affected columns, separated by a dot (\code{.}). Default is \code{""}. +If specified, prefix to be prepended to the column names of affected columns, separated by a dot (\code{.}). Initialized as \code{""}. } } \section{Internals}{ -Calls \code{\link{apply}} on the data, using the value of \code{applicator} as \code{FUN} and \code{simplify = TRUE}, then coerces the output via -\code{\link[data.table:as.data.table]{as.data.table()}}. +Calls \code{\link{apply}} on the data, using the value of \code{applicator} as \code{FUN}. } \section{Fields}{ diff --git a/tests/testthat/test_DataBackendJoin.R b/tests/testthat/test_DataBackendJoin.R new file mode 100644 index 000000000..de5b94bd9 --- /dev/null +++ b/tests/testthat/test_DataBackendJoin.R @@ -0,0 +1,138 @@ +context("DataBackendJoin") + + +test_that("DataBackendJoin works as expected", { + + d1 <- data.table(x = c(letters[-2], NA), y = LETTERS, z = rep(1:13, 2), id = (1:26) * 10L) + d2 <- data.table(a = c(paste0(letters, LETTERS)[-2], NA), y = letters, idx = (27:2) * 10L) + d3 <- data.table(a = c(paste0(letters, LETTERS)[-2], NA), y = letters, id = (27:2)) + + d1b <- DataBackendDataTable$new(d1, "id") + d2b <- DataBackendDataTable$new(d2, "idx") + d3b <- DataBackendDataTable$new(d3, "id") + + + + dbj <- DataBackendJoin$new(d1b, d2b, type = "outer") + + expect_backend(dbj) + + expect_identical(dbj$distinct(1:2, dbj$colnames), list(x = c("a", "c"), y = "z", z = 1:2, id = c(10L, 20L), a = character(0), idx = 20L, ..row_id = 1:2)) + expect_identical(dbj$distinct(1:2, dbj$colnames, na_rm = FALSE), list(x = c("a", "c"), y = c("z", NA), z = 1:2, id = c(10L, 20L), a = NA_character_, idx = c(20L, NA), ..row_id = 1:2)) + + expect_identical(dbj$missings(1:2, dbj$colnames), c(x = 0L, y = 1L, z = 0L, id = 0L, a = 2L, idx = 1L, ..row_id = 0L)) + expect_identical(dbj$missings(c(1:2, 2:1), dbj$colnames), 2L * c(x = 0L, y = 1L, z = 0L, id = 0L, a = 2L, idx = 1L, ..row_id = 0L)) + + expect_identical(dbj$data(1:3, c("x", "y", "id", "a", "idx")), + data.table(x = letters[1:4][-2], y = c(NA, letters[26:25]), id = (1:3) * 10L, a = c(NA, NA, "zZ"), idx = c(NA, (2:3) * 10L)) + ) + + expect_identical(dbj$data(1:3, c("x", "y", "id", "a", "idx", dbj$primary_key)), + data.table(x = letters[1:4][-2], y = c(NA, letters[26:25]), id = (1:3) * 10L, a = c(NA, NA, "zZ"), idx = c(NA, (2:3) * 10L), ..row_id = 1:3) + ) + + expect_identical(dbj$missings(1:3, c("x", "y", "id", "a", "idx")), c(x = 0L, y = 1L, id = 0L, a = 2L, idx = 1L)) + expect_identical(dbj$missings(1:3, c("x", "y", "id", "a", "idx", dbj$primary_key)), c(x = 0L, y = 1L, id = 0L, a = 2L, idx = 1L, ..row_id = 0L)) + + + dbj <- DataBackendJoin$new(d1b, d3b, by_b1 = "z", b1_index_colname = "b1index", b2_index_colname = "b2index", type = "outer") + + expect_backend(dbj) + + expected = merge(d1[, c("x", "z", "id"), with = FALSE], rev(d3), by.x = "z", by.y = "id", all = TRUE, sort = FALSE)[, .(x, y, z = ifelse(z %inrange% c(1, 13), z, NA), id = z, a, b1index = id, b2index = z)] + expected[, ..row_id := seq_len(nrow(expected))] + expected[id == 1, id := NA] + expected[b2index == 1, b2index := NA] + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), expected, check.attributes = FALSE) + + + dbj <- DataBackendJoin$new(d1b, d3b, by_b1 = "z", b1_index_colname = "b1index", b2_index_colname = "b2index", type = "inner") + expect_backend(dbj) + + expected = merge(d1[, c("x", "z", "id"), with = FALSE], rev(d3), by.x = "z", by.y = "id", all = FALSE, sort = FALSE)[, .(x, y, z = ifelse(z %inrange% c(1, 13), z, NA), id = z, a, b1index = id, b2index = z)] + expected[, ..row_id := seq_len(nrow(expected))] + expect_equal(dbj$data(dbj$rownames, dbj$colnames), expected, check.attributes = FALSE) + + dbj <- DataBackendJoin$new(d1b, d3b, by_b1 = "z", b1_index_colname = "b1index", b2_index_colname = "b2index", type = "left") + expect_backend(dbj) + + expected = merge(d1[, c("x", "z", "id"), with = FALSE], rev(d3), by.x = "z", by.y = "id", all.x = TRUE, all.y = FALSE, sort = FALSE)[, .(x, y, z = ifelse(z %inrange% c(1, 13), z, NA), id = z, a, b1index = id, b2index = z)] + expected[, ..row_id := seq_len(nrow(expected))] + expected[id == 1, id := NA] + expected[b2index == 1, b2index := NA] + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), expected, check.attributes = FALSE) + + dbj <- DataBackendJoin$new(d1b, d3b, by_b1 = "z", b1_index_colname = "b1index", b2_index_colname = "b2index", type = "right") + expect_backend(dbj) + + expected = merge(d1[, c("x", "z", "id"), with = FALSE], rev(d3), by.x = "z", by.y = "id", all.x = FALSE, all.y = TRUE, sort = FALSE)[, .(x, y, z = ifelse(z %inrange% c(1, 13), z, NA), id = z, a, b1index = id, b2index = z)] + expected[, ..row_id := seq_len(nrow(expected))] + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), expected, check.attributes = FALSE) +}) + +test_that("DataBackendJoin edge cases", { + d1 <- data.table(x = c(letters[-2], NA), y = LETTERS, z = rep(1:13, 2), ..row_id = (1:26) * 10L) + d2 <- data.table(a = c(paste0(letters, LETTERS)[-2], NA), y = letters, ..row_id = (27:2) * 10L) + + d1b <- DataBackendDataTable$new(d1, "..row_id") + d2b <- DataBackendDataTable$new(d2, "..row_id") + + dbj = DataBackendJoin$new(d1b, d2b, type = "inner", b1_index_colname = "b1", b2_index_colname = "b2") + expect_backend(dbj) + + expect_set_equal(dbj$colnames, c(colnames(d1), colnames(d2), "..row_id.1", "b1", "b2")) + + expect_equal( + dbj$data(dbj$rownames, dbj$colnames), + d1[d2, .(x, y = i.y, z, ..row_id, a, b1 = ..row_id, b2 = ..row_id, ..row_id.1 = seq_len(25)), on = "..row_id", nomatch = NULL], + check.attributes = FALSE + ) + + d1 = data.table(x = c(1, 2), y = c("a", "b"), ..row_id = 1:2) + d2 = data.table(x = 1, y = "z", ..row_id = 3) + d1b <- DataBackendDataTable$new(d1, "..row_id") + d2b <- DataBackendDataTable$new(d2, "..row_id") + + dbj <- DataBackendJoin$new(d1b, d2b, type = "inner", by_b1 = "x", by_b2 = "x", b1_index_colname = "b1", b2_index_colname = "b2") + expect_backend(dbj) + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), data.table(x = 1, y = "z", ..row_id = 3, b1 = 1, b2 = 3, ..row_id.1 = 1)) + + dbj <- DataBackendJoin$new(d1b, d2b, type = "outer", by_b1 = "x", by_b2 = "x", b1_index_colname = "b1", b2_index_colname = "b2") + expect_backend(dbj) + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), data.table(x = c(1, NA), y = c("z", NA), ..row_id = c(3, NA), b1 = 1:2, b2 = c(3, NA), ..row_id.1 = c(1, 2))) + + dbj <- DataBackendJoin$new(d1b, d2b, type = "inner", b1_index_colname = "b1", b2_index_colname = "b2") + expect_backend(dbj) + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), data.table(x = 1, y = "z", ..row_id = 3, b1 = 1, b2 = 3, ..row_id.1 = 1)[0]) + + dbj <- DataBackendJoin$new(d1b, d2b, type = "outer", b1_index_colname = "b1", b2_index_colname = "b2") + expect_backend(dbj) + + expect_equal(dbj$data(dbj$rownames, dbj$colnames), data.table(x = c(NA, NA, 1), y = c(NA, NA, "z"), ..row_id = c(NA, NA, 3), b1 = c(1, 2, NA), b2 = c(NA, NA, 3), ..row_id.1 = 1:3)) + +}) + +test_that("DataBackendJoin errors", { + + d1 <- data.table(x = c(letters[-2], NA), y = LETTERS, z = rep(1:13, 2), id = (1:26) * 10L) + d2 <- data.table(a = c(paste0(letters, LETTERS)[-2], NA), y = letters, idx = (27:2) * 10L) + + d1b <- DataBackendDataTable$new(d1, "id") + d2b <- DataBackendDataTable$new(d2, "idx") + + expect_error(DataBackendJoin$new(d1b, d2b, type = "outer", by_b1 = "n"), "by_b1.*of set.*but is 'n'") + expect_error(DataBackendJoin$new(d1b, d2b, type = "outer", by_b2 = "n"), "by_b2.*of set.*but is 'n'") + + expect_error(DataBackendJoin$new(d1b, d2b, type = "inner", b1_index_colname = "x"), "already a non-primary-key") + expect_error(DataBackendJoin$new(d1b, d2b, type = "inner", b1_index_colname = "a"), "already a non-primary-key") + expect_error(DataBackendJoin$new(d1b, d2b, type = "inner", b2_index_colname = "x"), "already a non-primary-key") + expect_error(DataBackendJoin$new(d1b, d2b, type = "inner", b2_index_colname = "a"), "already a non-primary-key") + expect_error(DataBackendJoin$new(d1b, d2b, type = "inner", b1_index_colname = "n", b2_index_colname = "n"), "must be different") + +}) diff --git a/tests/testthat/test_DataBackendMultiCbind.R b/tests/testthat/test_DataBackendMultiCbind.R new file mode 100644 index 000000000..7d0a2f361 --- /dev/null +++ b/tests/testthat/test_DataBackendMultiCbind.R @@ -0,0 +1,77 @@ +context("DataBackendMultiCbind") + + +test_that("DataBackendMultiCbind works as expected", { + + d1 = data.table(x = c(letters[1:3], NA), y = LETTERS[1:4], z = c(1, 2, 2, 1), id = (1:4) * 10L) + d2 = data.table(a = c(paste0(letters[1:3], LETTERS[1:3]), NA), y = letters[20:23], id = -(2:5) * 10L, idx = (0:3) * 10L) + d3 = data.table(x = as.character(1:4), z = 9:6, id = (3:6) * 10L) + + d1b <- DataBackendDataTable$new(d1, "id") + d2b <- DataBackendDataTable$new(d2, "idx") + d3b <- DataBackendDataTable$new(d3, "id") + + dbmc <- DataBackendMultiCbind$new(list(d1b, d2b, d3b)) + + expect_backend(dbmc) + + expect_equal( + dbmc$data((0:6) * 10L, dbmc$colnames), + data.table( + x = c(NA_character_, NA, NA, 1:4), y = c(d2$y, NA, NA, NA), z = c(NA, NA, NA, 9:6), + id = c(NA, NA, NA, 3:6) * 10L, a = c(d2$a, NA, NA, NA), idx = c((0:3) * 10L, NA, NA, NA), + ..row_id = (0:6) * 10L + ) + ) + + + dbmc <- DataBackendMultiCbind$new(list(d1b, d3b)) + + expect_backend(dbmc) + + expect_equal( + dbmc$data((0:6) * 10L, dbmc$colnames), + data.table( + x = c(NA_character_, NA, 1:4), y = c(d1$y, NA, NA), z = c(NA, NA, 9:6), + id = (1:6) * 10L + ) + ) + + d0b = DataBackendDataTable$new(data.table(id = c(10:20)), "id") + + dbmc <- DataBackendMultiCbind$new(list(d0b, d1b)) + + expect_backend(dbmc) + + expect_set_equal(dbmc$rownames, c(10:20, 30L, 40L)) + + expect_equal(dbmc$data(c(10:20, 30L, 40L), dbmc$colnames), data.table( + id = c(10:20, 30L, 40L), + x = c("a", rep(NA, 9), letters[2:3], NA), + y = c("A", rep(NA, 9), LETTERS[2:4]), + z = c(1, rep(NA, 9), 2, 2, 1) + )) + + expect_identical(dbmc$data(11, dbmc$colnames), data.table(id = 11L, x = NA_character_, y = NA_character_, z = NA_real_)) + + expect_identical(dbmc$data(11, "x"), data.table(x = NA_character_)) + + + + dbmc <- DataBackendMultiCbind$new(list(d0b, d3b)) + + expect_backend(dbmc) + + + expect_set_equal(dbmc$rownames, c(10:20, (3:6)*10L)) + + expect_equal(dbmc$data(c(10:20, (3:6) * 10L), dbmc$colnames), data.table( + id = c(10:20, (3:6) * 10L), + x = c(rep(NA_character_, 11), 1:4), + z = c(rep(NA, 11), 9:6) + )) + + +}) + +