From 042b7c4f8105dd8668b87e7da5874be44297e8c8 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 00:50:35 -0700 Subject: [PATCH 1/9] [CI] Move lint to a separate script --- R-package/tests/run_lint.R | 24 ++++++++++++++++++++++++ R-package/tests/testthat/test_lint.R | 26 -------------------------- 2 files changed, 24 insertions(+), 26 deletions(-) create mode 100644 R-package/tests/run_lint.R delete mode 100644 R-package/tests/testthat/test_lint.R diff --git a/R-package/tests/run_lint.R b/R-package/tests/run_lint.R new file mode 100644 index 000000000000..7a6038b2149f --- /dev/null +++ b/R-package/tests/run_lint.R @@ -0,0 +1,24 @@ +library(lintr) + +my_linters <- list( + absolute_path_linter = lintr::absolute_path_linter, + assignment_linter = lintr::assignment_linter, + closed_curly_linter = lintr::closed_curly_linter, + commas_linter = lintr::commas_linter, + # commented_code_linter = lintr::commented_code_linter, + infix_spaces_linter = lintr::infix_spaces_linter, + line_length_linter = lintr::line_length_linter, + no_tab_linter = lintr::no_tab_linter, + object_usage_linter = lintr::object_usage_linter, + # snake_case_linter = lintr::snake_case_linter, + # multiple_dots_linter = lintr::multiple_dots_linter, + object_length_linter = lintr::object_length_linter, + open_curly_linter = lintr::open_curly_linter, + # single_quotes_linter = lintr::single_quotes_linter, + spaces_inside_linter = lintr::spaces_inside_linter, + spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter, + trailing_blank_lines_linter = lintr::trailing_blank_lines_linter, + trailing_whitespace_linter = lintr::trailing_whitespace_linter, + true_false = lintr::T_and_F_symbol_linter +) +lint_package(linters = my_linters) # uncomment this if you want to check code quality diff --git a/R-package/tests/testthat/test_lint.R b/R-package/tests/testthat/test_lint.R deleted file mode 100644 index 77fd264efaf2..000000000000 --- a/R-package/tests/testthat/test_lint.R +++ /dev/null @@ -1,26 +0,0 @@ -context("Code is of high quality and lint free") -test_that("Code Lint", { - skip_on_cran() - my_linters <- list( - absolute_path_linter = lintr::absolute_path_linter, - assignment_linter = lintr::assignment_linter, - closed_curly_linter = lintr::closed_curly_linter, - commas_linter = lintr::commas_linter, - # commented_code_linter = lintr::commented_code_linter, - infix_spaces_linter = lintr::infix_spaces_linter, - line_length_linter = lintr::line_length_linter, - no_tab_linter = lintr::no_tab_linter, - object_usage_linter = lintr::object_usage_linter, - # snake_case_linter = lintr::snake_case_linter, - # multiple_dots_linter = lintr::multiple_dots_linter, - object_length_linter = lintr::object_length_linter, - open_curly_linter = lintr::open_curly_linter, - # single_quotes_linter = lintr::single_quotes_linter, - spaces_inside_linter = lintr::spaces_inside_linter, - spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter, - trailing_blank_lines_linter = lintr::trailing_blank_lines_linter, - trailing_whitespace_linter = lintr::trailing_whitespace_linter, - true_false = lintr::T_and_F_symbol_linter - ) - lintr::expect_lint_free(linters = my_linters) # uncomment this if you want to check code quality -}) From ea2830055f3d61c5be6a430fd6962d8e8a2b547d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 01:44:30 -0700 Subject: [PATCH 2/9] [CI] Improved lintr launcher --- R-package/tests/run_lint.R | 13 +++++++- .../tests/testthat/test_model_compatibility.R | 33 ++++++++++--------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/R-package/tests/run_lint.R b/R-package/tests/run_lint.R index 7a6038b2149f..e6ddf356c167 100644 --- a/R-package/tests/run_lint.R +++ b/R-package/tests/run_lint.R @@ -21,4 +21,15 @@ my_linters <- list( trailing_whitespace_linter = lintr::trailing_whitespace_linter, true_false = lintr::T_and_F_symbol_linter ) -lint_package(linters = my_linters) # uncomment this if you want to check code quality + +found_error <- FALSE + +for (r_file in list.files(path = '.', pattern = '\\.[Rr]$', recursive = TRUE)) { + cat(sprintf("**** %s\n\n", r_file)) + output <- lintr::lint(filename = r_file, linters = my_linters) + print(output) + if (length(output) > 0) { + found_error <- TRUE + } +} +quit(save = 'no', status = ifelse(found_error, 1, 0)) diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R index d56207e3bb4f..7204ed89142c 100644 --- a/R-package/tests/testthat/test_model_compatibility.R +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -7,8 +7,8 @@ context("Models from previous versions of XGBoost can be loaded") metadata <- model_generator_metadata() run_model_param_check <- function (config) { - expect_equal(config$learner$learner_model_param$num_feature, '4') - expect_equal(config$learner$learner_train_param$booster, 'gbtree') + testthat::expect_equal(config$learner$learner_model_param$num_feature, '4') + testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree') } get_num_tree <- function (booster) { @@ -27,22 +27,24 @@ run_booster_check <- function (booster, name) { config <- jsonlite::fromJSON(xgb.config(booster)) run_model_param_check(config) if (name == 'cls') { - expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds * metadata$kClasses) - expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5) - expect_equal(config$learner$learner_train_param$objective, 'multi:softmax') - expect_equal(as.numeric(config$learner$learner_model_param$num_class), metadata$kClasses) + testthat::expect_equal(get_num_tree(booster), + metadata$kForests * metadata$kRounds * metadata$kClasses) + testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5) + testthat::expect_equal(config$learner$learner_train_param$objective, 'multi:softmax') + testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), + metadata$kClasses) } else if (name == 'logit') { - expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) - expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0) - expect_equal(config$learner$learner_train_param$objective, 'binary:logistic') + testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) + testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0) + testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logistic') } else if (name == 'ltr') { - expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) - expect_equal(config$learner$learner_train_param$objective, 'rank:ndcg') + testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) + testthat::expect_equal(config$learner$learner_train_param$objective, 'rank:ndcg') } else { - expect_equal(name, 'reg') - expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) - expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5) - expect_equal(config$learner$learner_train_param$objective, 'reg:squarederror') + testthat::expect_equal(name, 'reg') + testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) + testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5) + testthat::expect_equal(config$learner$learner_train_param$objective, 'reg:squarederror') } } @@ -73,5 +75,4 @@ test_that("Models from previous versions of XGBoost can be loaded", { predict(booster, newdata = pred_data) run_booster_check(booster, name) }) - expect_true(TRUE) }) From 1eb5a95c7a7de82039befdfeb00c681fdb04efe8 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 01:49:42 -0700 Subject: [PATCH 3/9] Add lintr as a separate action --- .github/workflows/main.yml | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e32e81de9174..dada58c0ecfc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -38,6 +38,47 @@ jobs: mvn test -pl :xgboost4j_2.12 + lintr: + runs-on: ${{ matrix.config.os }} + + name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} + + strategy: + matrix: + config: + - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + + - uses: r-lib/actions/setup-r@master + with: + r-version: ${{ matrix.config.r }} + + - name: Cache R packages + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2- + + - name: Install dependencies + shell: Rscript {0} + run: | + install.packages(c('XML','igraph')) + install.packages(c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown')) + + - name: Test R + run: | + cd R-package + Rscript.exe tests/run_lint.R + + test-with-R: runs-on: ${{ matrix.config.os }} From a44941b8f0f724fe2cdb6064b160779ba4edcb0d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 03:11:02 -0700 Subject: [PATCH 4/9] Add custom parsing logic to print out logs --- R-package/tests/run_lint.R | 52 ++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/R-package/tests/run_lint.R b/R-package/tests/run_lint.R index e6ddf356c167..823c77becf18 100644 --- a/R-package/tests/run_lint.R +++ b/R-package/tests/run_lint.R @@ -1,4 +1,5 @@ library(lintr) +library(crayon) my_linters <- list( absolute_path_linter = lintr::absolute_path_linter, @@ -22,14 +23,49 @@ my_linters <- list( true_false = lintr::T_and_F_symbol_linter ) -found_error <- FALSE +results <- lapply( + list.files(path = '.', pattern = '\\.[Rr]$', recursive = TRUE), + function (r_file) { + cat(sprintf("Processing %s ...\n", r_file)) + list(r_file = r_file, + output = lintr::lint(filename = r_file, linters = my_linters)) + }) +num_issue <- Reduce(sum, lapply(results, function (e) length(e$output))) -for (r_file in list.files(path = '.', pattern = '\\.[Rr]$', recursive = TRUE)) { - cat(sprintf("**** %s\n\n", r_file)) - output <- lintr::lint(filename = r_file, linters = my_linters) - print(output) - if (length(output) > 0) { - found_error <- TRUE +lint2str <- function(lint_entry) { + color <- function(type) { + switch(type, + "warning" = crayon::magenta, + "error" = crayon::red, + "style" = crayon::blue, + crayon::bold + ) } + + paste0( + lapply(lint_entry$output, + function (lint_line) { + paste0( + crayon::bold(lint_entry$r_file, ":", + as.character(lint_line$line_number), ":", + as.character(lint_line$column_number), ": ", sep = ""), + color(lint_line$type)(lint_line$type, ": ", sep = ""), + crayon::bold(lint_line$message), "\n", + lint_line$line, "\n", + lintr:::highlight_string(lint_line$message, lint_line$column_number, lint_line$ranges), + "\n", + collapse = "") + }), + collapse = "") +} + +if (num_issue > 0) { + cat(sprintf('R linters found %d issues:\n', num_issue)) + for (entry in results) { + if (length(entry$output)) { + cat(paste0('**** ', crayon::bold(entry$r_file), '\n')) + cat(paste0(lint2str(entry), collapse = '')) + } + } + quit(save = 'no', status = 1) # Signal error to parent shell } -quit(save = 'no', status = ifelse(found_error, 1, 0)) From 0a855756cf473b423dbb4482a5cdc263fdc3eac4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 10:19:22 -0700 Subject: [PATCH 5/9] Fix lintr issues in demos --- R-package/demo/README.md | 2 +- R-package/demo/basic_walkthrough.R | 34 ++++----- R-package/demo/boost_from_prediction.R | 10 +-- R-package/demo/caret_wrapper.R | 8 +- R-package/demo/create_sparse_matrix.R | 20 ++--- R-package/demo/cross_validation.R | 18 ++--- R-package/demo/custom_objective.R | 22 +++--- R-package/demo/early_stopping.R | 10 +-- R-package/demo/generalized_linear_model.R | 11 ++- R-package/demo/gpu_accelerated.R | 10 +-- R-package/demo/interaction_constraints.R | 90 ++++++++++++----------- R-package/demo/poisson_regression.R | 9 +-- R-package/demo/predict_first_ntree.R | 20 ++--- R-package/demo/predict_leaf_indices.R | 30 ++++---- R-package/demo/runall.R | 2 +- R-package/demo/tweedie_regression.R | 18 ++--- 16 files changed, 161 insertions(+), 153 deletions(-) mode change 100755 => 100644 R-package/demo/tweedie_regression.R diff --git a/R-package/demo/README.md b/R-package/demo/README.md index e53afeaeaf37..e9a840cb7997 100644 --- a/R-package/demo/README.md +++ b/R-package/demo/README.md @@ -17,4 +17,4 @@ Benchmarks Notes ==== * Contribution of examples, benchmarks is more than welcomed! -* If you like to share how you use xgboost to solve your problem, send a pull request:) +* If you like to share how you use xgboost to solve your problem, send a pull request :) diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 914d8b5a02c0..445a19aeeefd 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -3,8 +3,8 @@ require(methods) # we load in the agaricus dataset # In this example, we are aiming to predict whether a mushroom is edible -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') train <- agaricus.train test <- agaricus.test # the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1} @@ -26,7 +26,7 @@ bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features print("Training xgboost with xgb.DMatrix") dtrain <- xgb.DMatrix(data = train$data, label = train$label) -bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2, +bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2, objective = "binary:logistic") # Verbose = 0,1,2 @@ -46,7 +46,7 @@ bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, #--------------------basic prediction using xgboost-------------- # you can do prediction using the following line -# you can put in Matrix, sparseMatrix, or xgb.DMatrix +# you can put in Matrix, sparseMatrix, or xgb.DMatrix pred <- predict(bst, test$data) err <- mean(as.numeric(pred > 0.5) != test$label) print(paste("test-error=", err)) @@ -58,31 +58,31 @@ xgb.save(bst, "xgboost.model") bst2 <- xgb.load("xgboost.model") pred2 <- predict(bst2, test$data) # pred2 should be identical to pred -print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred)))) +print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred)))) # save model to R's raw vector -raw = xgb.save.raw(bst) +raw <- xgb.save.raw(bst) # load binary model to R bst3 <- xgb.load(raw) pred3 <- predict(bst3, test$data) # pred3 should be identical to pred -print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred)))) +print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred)))) #----------------Advanced features -------------- # to use advanced features, we need to put data in xgb.DMatrix -dtrain <- xgb.DMatrix(data = train$data, label=train$label) -dtest <- xgb.DMatrix(data = test$data, label=test$label) +dtrain <- xgb.DMatrix(data = train$data, label = train$label) +dtest <- xgb.DMatrix(data = test$data, label = test$label) #---------------Using watchlist---------------- # watchlist is a list of xgb.DMatrix, each of them is tagged with name -watchlist <- list(train=dtrain, test=dtest) +watchlist <- list(train = dtrain, test = dtest) # to train with watchlist, use xgb.train, which contains more advanced features -# watchlist allows us to monitor the evaluation result on all data in the list +# watchlist allows us to monitor the evaluation result on all data in the list print("Train xgboost using xgb.train with watchlist") -bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist, +bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, nthread = 2, objective = "binary:logistic") # we can change evaluation metrics, or use multiple evaluation metrics print("train xgboost using xgb.train with watchlist, watch logloss and error") -bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist, +bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, eval_metric = "error", eval_metric = "logloss", nthread = 2, objective = "binary:logistic") @@ -90,16 +90,16 @@ bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist xgb.DMatrix.save(dtrain, "dtrain.buffer") # to load it in, simply call xgb.DMatrix dtrain2 <- xgb.DMatrix("dtrain.buffer") -bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nrounds=2, watchlist=watchlist, +bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, nthread = 2, objective = "binary:logistic") # information can be extracted from xgb.DMatrix using getinfo -label = getinfo(dtest, "label") +label <- getinfo(dtest, "label") pred <- predict(bst, dtest) -err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label) +err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label) print(paste("test-error=", err)) # You can dump the tree you learned using xgb.dump into a text file -dump_path = file.path(tempdir(), 'dump.raw.txt') +dump_path <- file.path(tempdir(), 'dump.raw.txt') xgb.dump(bst, dump_path, with_stats = TRUE) # Finally, you can check which features are the most important. diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index 0ac0d24a429e..1a3d55369d2f 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -1,7 +1,7 @@ require(xgboost) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) @@ -11,12 +11,12 @@ watchlist <- list(eval = dtest, train = dtrain) # print('start running example to start from a initial prediction') # train xgboost for 1 round -param <- list(max_depth=2, eta=1, nthread = 2, objective='binary:logistic') +param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') bst <- xgb.train(param, dtrain, 1, watchlist) # Note: we need the margin value instead of transformed prediction in set_base_margin # do predict with output_margin=TRUE, will always give you margin values before logistic transformation -ptrain <- predict(bst, dtrain, outputmargin=TRUE) -ptest <- predict(bst, dtest, outputmargin=TRUE) +ptrain <- predict(bst, dtrain, outputmargin = TRUE) +ptest <- predict(bst, dtest, outputmargin = TRUE) # set the base_margin property of dtrain and dtest # base margin is the base prediction we will boost from setinfo(dtrain, "base_margin", ptrain) diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R index 9ab5933d2b75..ded5d92d2c2a 100644 --- a/R-package/demo/caret_wrapper.R +++ b/R-package/demo/caret_wrapper.R @@ -1,5 +1,5 @@ # install development version of caret library that contains xgboost models -devtools::install_github("topepo/caret/pkg/caret") +devtools::install_github("topepo/caret/pkg/caret") require(caret) require(xgboost) require(data.table) @@ -13,13 +13,13 @@ df <- data.table(Arthritis, keep.rownames = FALSE) # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features. # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values. -df[,AgeDiscret:= as.factor(round(Age/10,0))] +df[, AgeDiscret := as.factor(round(Age / 10, 0))] # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!). -df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] +df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). -df[,ID:=NULL] +df[, ID := NULL] #-------------Basic Training using XGBoost in caret Library----------------- # Set up control parameters for caret::train diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R index 63d1a5b2f52b..8de81afc990d 100644 --- a/R-package/demo/create_sparse_matrix.R +++ b/R-package/demo/create_sparse_matrix.R @@ -6,10 +6,10 @@ if (!require(vcd)) { require(vcd) } # According to its documentation, Xgboost works only on numbers. -# Sometimes the dataset we have to work on have categorical data. +# Sometimes the dataset we have to work on have categorical data. # A categorical variable is one which have a fixed number of values. By example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable. # -# In R, categorical variable is called Factor. +# In R, categorical variable is called Factor. # Type ?factor in console for more information. # # In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost. @@ -32,17 +32,17 @@ str(df) # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features. # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values. -df[,AgeDiscret:= as.factor(round(Age/10,0))] +df[, AgeDiscret := as.factor(round(Age / 10, 0))] # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!). -df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] +df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). -df[,ID:=NULL] +df[, ID := NULL] # List the different values for the column Treatment: Placebo, Treated. cat("Values of the categorical feature Treatment\n") -print(levels(df[,Treatment])) +print(levels(df[, Treatment])) # Next step, we will transform the categorical data to dummy variables. # This method is also called one hot encoding. @@ -52,16 +52,16 @@ print(levels(df[,Treatment])) # # Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values. # Column Improved is excluded because it will be our output column, the one we want to predict. -sparse_matrix = sparse.model.matrix(Improved~.-1, data = df) +sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df) cat("Encoding of the sparse Matrix\n") print(sparse_matrix) # Create the output vector (not sparse) -# 1. Set, for all rows, field in Y column to 0; -# 2. set Y to 1 when Improved == Marked; +# 1. Set, for all rows, field in Y column to 0; +# 2. set Y to 1 when Improved == Marked; # 3. Return Y column -output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y] +output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y] # Following is the same process as other demo cat("Learning...\n") diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R index 47809ac89787..e55ff3915cd4 100644 --- a/R-package/demo/cross_validation.R +++ b/R-package/demo/cross_validation.R @@ -1,25 +1,25 @@ require(xgboost) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) nrounds <- 2 -param <- list(max_depth=2, eta=1, nthread=2, objective='binary:logistic') +param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') cat('running cross validation\n') # do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric -xgb.cv(param, dtrain, nrounds, nfold=5, metrics={'error'}) +xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = {'error'}) cat('running cross validation, disable standard deviation display\n') # do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric -xgb.cv(param, dtrain, nrounds, nfold=5, - metrics='error', showsd = FALSE) +xgb.cv(param, dtrain, nrounds, nfold = 5, + metrics = 'error', showsd = FALSE) ### # you can also do cross validation with cutomized loss function @@ -29,18 +29,18 @@ print ('running cross validation, with cutomsized loss function') logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } evalerror <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0)))/length(labels) + err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } -param <- list(max_depth=2, eta=1, +param <- list(max_depth = 2, eta = 1, objective = logregobj, eval_metric = evalerror) # train with customized objective xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5) diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R index ec7e7e8a2ba4..d251f40059d2 100644 --- a/R-package/demo/custom_objective.R +++ b/R-package/demo/custom_objective.R @@ -1,7 +1,7 @@ require(xgboost) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) @@ -15,7 +15,7 @@ num_round <- 2 # this is loglikelihood loss logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) @@ -29,36 +29,36 @@ logregobj <- function(preds, dtrain) { # Take this in mind when you use the customization, and maybe you need write customized evaluation function evalerror <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0)))/length(labels) + err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } -param <- list(max_depth=2, eta=1, nthread = 2, verbosity=0, - objective=logregobj, eval_metric=evalerror) +param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, + objective = logregobj, eval_metric = evalerror) print ('start training with user customized objective') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist) # -# there can be cases where you want additional information +# there can be cases where you want additional information # being considered besides the property of DMatrix you can get by getinfo # you can set additional information as attributes if DMatrix -# set label attribute of dtrain to be label, we use label as an example, it can be anything +# set label attribute of dtrain to be label, we use label as an example, it can be anything attr(dtrain, 'label') <- getinfo(dtrain, 'label') # this is new customized objective, where you can access things you set # same thing applies to customized evaluation function logregobjattr <- function(preds, dtrain) { # now you can access the attribute in customized function labels <- attr(dtrain, 'label') - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } -param <- list(max_depth=2, eta=1, nthread = 2, verbosity=0, - objective=logregobjattr, eval_metric=evalerror) +param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, + objective = logregobjattr, eval_metric = evalerror) print ('start training with user customized objective, with additional attributes in DMatrix') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R index 92a3ee812c50..65210a5d366f 100644 --- a/R-package/demo/early_stopping.R +++ b/R-package/demo/early_stopping.R @@ -1,20 +1,20 @@ require(xgboost) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) # note: for customized objective function, we leave objective as default # note: what we are getting is margin value in prediction # you must know what you are doing -param <- list(max_depth=2, eta=1, nthread=2, verbosity=0) +param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0) watchlist <- list(eval = dtest) num_round <- 20 # user define objective function, given prediction, return gradient and second order gradient # this is loglikelihood loss logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) @@ -27,7 +27,7 @@ logregobj <- function(preds, dtrain) { # Take this in mind when you use the customization, and maybe you need write customized evaluation function evalerror <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0)))/length(labels) + err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } print ('start training with early Stopping setting') diff --git a/R-package/demo/generalized_linear_model.R b/R-package/demo/generalized_linear_model.R index 3c2cdb5420f3..c24fe72cbcad 100644 --- a/R-package/demo/generalized_linear_model.R +++ b/R-package/demo/generalized_linear_model.R @@ -1,7 +1,7 @@ require(xgboost) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) ## @@ -11,14 +11,14 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) ## # change booster to gblinear, so that we are fitting a linear model -# alpha is the L1 regularizer +# alpha is the L1 regularizer # lambda is the L2 regularizer # you can also set lambda_bias which is L2 regularizer on the bias term param <- list(objective = "binary:logistic", booster = "gblinear", nthread = 2, alpha = 0.0001, lambda = 1) # normally, you do not need to set eta (step_size) -# XGBoost uses a parallel coordinate descent algorithm (shotgun), +# XGBoost uses a parallel coordinate descent algorithm (shotgun), # there could be affection on convergence with parallelization on certain cases # setting eta to be smaller value, e.g 0.5 can make the optimization more stable @@ -30,5 +30,4 @@ num_round <- 2 bst <- xgb.train(param, dtrain, num_round, watchlist) ypred <- predict(bst, dtest) labels <- getinfo(dtest, 'label') -cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n') - +cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n') diff --git a/R-package/demo/gpu_accelerated.R b/R-package/demo/gpu_accelerated.R index 321255c72cb5..14ed9392b7d1 100644 --- a/R-package/demo/gpu_accelerated.R +++ b/R-package/demo/gpu_accelerated.R @@ -1,9 +1,9 @@ # An example of using GPU-accelerated tree building algorithms -# -# NOTE: it can only run if you have a CUDA-enable GPU and the package was +# +# NOTE: it can only run if you have a CUDA-enable GPU and the package was # specially compiled with GPU support. # -# For the current functionality, see +# For the current functionality, see # https://xgboost.readthedocs.io/en/latest/gpu/index.html # @@ -21,8 +21,8 @@ m <- X[, sel] %*% betas - 1 + rnorm(N) y <- rbinom(N, 1, plogis(m)) tr <- sample.int(N, N * 0.75) -dtrain <- xgb.DMatrix(X[tr,], label = y[tr]) -dtest <- xgb.DMatrix(X[-tr,], label = y[-tr]) +dtrain <- xgb.DMatrix(X[tr, ], label = y[tr]) +dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr]) wl <- list(train = dtrain, test = dtest) # An example of running 'gpu_hist' algorithm diff --git a/R-package/demo/interaction_constraints.R b/R-package/demo/interaction_constraints.R index 41c32d0e8b12..cc8617d9d5a4 100644 --- a/R-package/demo/interaction_constraints.R +++ b/R-package/demo/interaction_constraints.R @@ -4,33 +4,38 @@ library(data.table) set.seed(1024) # Function to obtain a list of interactions fitted in trees, requires input of maximum depth -treeInteractions <- function(input_tree, input_max_depth){ - trees <- copy(input_tree) # copy tree input to prevent overwriting +treeInteractions <- function(input_tree, input_max_depth) { + ID_merge <- i.id <- i.feature <- NULL # Suppress warning "no visible binding for global variable" + + trees <- data.table::copy(input_tree) # copy tree input to prevent overwriting if (input_max_depth < 2) return(list()) # no interactions if max depth < 2 if (nrow(input_tree) == 1) return(list()) # Attach parent nodes - for (i in 2:input_max_depth){ - if (i == 2) trees[, ID_merge:=ID] else trees[, ID_merge:=get(paste0('parent_',i-2))] - parents_left <- trees[!is.na(Split), list(i.id=ID, i.feature=Feature, ID_merge=Yes)] - parents_right <- trees[!is.na(Split), list(i.id=ID, i.feature=Feature, ID_merge=No)] - - setorderv(trees, 'ID_merge') - setorderv(parents_left, 'ID_merge') - setorderv(parents_right, 'ID_merge') - - trees <- merge(trees, parents_left, by='ID_merge', all.x=TRUE) - trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)] - trees[, c('i.id','i.feature'):=NULL] - - trees <- merge(trees, parents_right, by='ID_merge', all.x=TRUE) - trees[!is.na(i.id), c(paste0('parent_', i-1), paste0('parent_feat_', i-1)):=list(i.id, i.feature)] - trees[, c('i.id','i.feature'):=NULL] + for (i in 2:input_max_depth) { + if (i == 2) trees[, ID_merge := ID] else trees[, ID_merge := get(paste0('parent_', i - 2))] + parents_left <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = Yes)] + parents_right <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = No)] + + data.table::setorderv(trees, 'ID_merge') + data.table::setorderv(parents_left, 'ID_merge') + data.table::setorderv(parents_right, 'ID_merge') + + trees <- merge(trees, parents_left, by = 'ID_merge', all.x = TRUE) + trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1)) + := list(i.id, i.feature)] + trees[, c('i.id', 'i.feature') := NULL] + + trees <- merge(trees, parents_right, by = 'ID_merge', all.x = TRUE) + trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1)) + := list(i.id, i.feature)] + trees[, c('i.id', 'i.feature') := NULL] } # Extract nodes with interactions - interaction_trees <- trees[!is.na(Split) & !is.na(parent_1), - c('Feature',paste0('parent_feat_',1:(input_max_depth-1))), with=FALSE] + interaction_trees <- trees[!is.na(Split) & !is.na(parent_1), + c('Feature', paste0('parent_feat_', 1:(input_max_depth - 1))), + with = FALSE] interaction_trees_split <- split(interaction_trees, 1:nrow(interaction_trees)) interaction_list <- lapply(interaction_trees_split, as.character) @@ -47,59 +52,62 @@ treeInteractions <- function(input_tree, input_max_depth){ # Generate sample data x <- list() -for (i in 1:10){ - x[[i]] = i*rnorm(1000, 10) +for (i in 1:10) { + x[[i]] <- i * rnorm(1000, 10) } x <- as.data.table(x) -y = -1*x[, rowSums(.SD)] + x[['V1']]*x[['V2']] + x[['V3']]*x[['V4']]*x[['V5']] + rnorm(1000, 0.001) + 3*sin(x[['V7']]) +y <- -1 * x[, rowSums(.SD)] + x[['V1']] * x[['V2']] + x[['V3']] * x[['V4']] * x[['V5']] + + rnorm(1000, 0.001) + 3 * sin(x[['V7']]) -train = as.matrix(x) +train <- as.matrix(x) # Interaction constraint list (column names form) -interaction_list <- list(c('V1','V2'),c('V3','V4','V5')) +interaction_list <- list(c('V1', 'V2'), c('V3', 'V4', 'V5')) # Convert interaction constraint list into feature index form cols2ids <- function(object, col_names) { LUT <- seq_along(col_names) - 1 names(LUT) <- col_names - rapply(object, function(x) LUT[x], classes="character", how="replace") + rapply(object, function(x) LUT[x], classes = "character", how = "replace") } -interaction_list_fid = cols2ids(interaction_list, colnames(train)) +interaction_list_fid <- cols2ids(interaction_list, colnames(train)) # Fit model with interaction constraints -bst = xgboost(data = train, label = y, max_depth = 4, - eta = 0.1, nthread = 2, nrounds = 1000, - interaction_constraints = interaction_list_fid) +bst <- xgboost(data = train, label = y, max_depth = 4, + eta = 0.1, nthread = 2, nrounds = 1000, + interaction_constraints = interaction_list_fid) bst_tree <- xgb.model.dt.tree(colnames(train), bst) -bst_interactions <- treeInteractions(bst_tree, 4) # interactions constrained to combinations of V1*V2 and V3*V4*V5 +bst_interactions <- treeInteractions(bst_tree, 4) + # interactions constrained to combinations of V1*V2 and V3*V4*V5 # Fit model without interaction constraints -bst2 = xgboost(data = train, label = y, max_depth = 4, - eta = 0.1, nthread = 2, nrounds = 1000) +bst2 <- xgboost(data = train, label = y, max_depth = 4, + eta = 0.1, nthread = 2, nrounds = 1000) bst2_tree <- xgb.model.dt.tree(colnames(train), bst2) bst2_interactions <- treeInteractions(bst2_tree, 4) # much more interactions # Fit model with both interaction and monotonicity constraints -bst3 = xgboost(data = train, label = y, max_depth = 4, - eta = 0.1, nthread = 2, nrounds = 1000, - interaction_constraints = interaction_list_fid, - monotone_constraints = c(-1,0,0,0,0,0,0,0,0,0)) +bst3 <- xgboost(data = train, label = y, max_depth = 4, + eta = 0.1, nthread = 2, nrounds = 1000, + interaction_constraints = interaction_list_fid, + monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0)) bst3_tree <- xgb.model.dt.tree(colnames(train), bst3) -bst3_interactions <- treeInteractions(bst3_tree, 4) # interactions still constrained to combinations of V1*V2 and V3*V4*V5 +bst3_interactions <- treeInteractions(bst3_tree, 4) + # interactions still constrained to combinations of V1*V2 and V3*V4*V5 # Show monotonic constraints still apply by checking scores after incrementing V1 x1 <- sort(unique(x[['V1']])) for (i in 1:length(x1)){ testdata <- copy(x[, -c('V1')]) testdata[['V1']] <- x1[i] - testdata <- testdata[, paste0('V',1:10), with=FALSE] + testdata <- testdata[, paste0('V', 1:10), with = FALSE] pred <- predict(bst3, as.matrix(testdata)) - + # Should not print out anything due to monotonic constraints if (i > 1) if (any(pred > prev_pred)) print(i) - prev_pred <- pred + prev_pred <- pred } diff --git a/R-package/demo/poisson_regression.R b/R-package/demo/poisson_regression.R index f9dc4ac62153..121ac17f2173 100644 --- a/R-package/demo/poisson_regression.R +++ b/R-package/demo/poisson_regression.R @@ -1,7 +1,6 @@ data(mtcars) head(mtcars) -bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], - objective='count:poisson',nrounds=5) -pred = predict(bst,as.matrix(mtcars[,-11])) -sqrt(mean((pred-mtcars[,11])^2)) - +bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11], + objective = 'count:poisson', nrounds = 5) +pred <- predict(bst, as.matrix(mtcars[, -11])) +sqrt(mean((pred - mtcars[, 11]) ^ 2)) diff --git a/R-package/demo/predict_first_ntree.R b/R-package/demo/predict_first_ntree.R index 384a6fb0a58f..02c168b77e43 100644 --- a/R-package/demo/predict_first_ntree.R +++ b/R-package/demo/predict_first_ntree.R @@ -1,23 +1,23 @@ require(xgboost) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) -param <- list(max_depth=2, eta=1, objective='binary:logistic') +param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') watchlist <- list(eval = dtest, train = dtrain) -nrounds = 2 +nrounds <- 2 # training the model for two rounds -bst = xgb.train(param, dtrain, nrounds, nthread = 2, watchlist) +bst <- xgb.train(param, dtrain, nrounds, nthread = 2, watchlist) cat('start testing prediction from first n trees\n') -labels <- getinfo(dtest,'label') +labels <- getinfo(dtest, 'label') ### predict using first 1 tree -ypred1 = predict(bst, dtest, ntreelimit=1) +ypred1 <- predict(bst, dtest, ntreelimit = 1) # by default, we predict using all the trees -ypred2 = predict(bst, dtest) +ypred2 <- predict(bst, dtest) -cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n') -cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n') +cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != labels), '\n') +cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != labels), '\n') diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R index 492c17b138f7..0f5d8f329618 100644 --- a/R-package/demo/predict_leaf_indices.R +++ b/R-package/demo/predict_leaf_indices.R @@ -5,34 +5,34 @@ require(Matrix) set.seed(1982) # load in the agaricus dataset -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) -param <- list(max_depth=2, eta=1, objective='binary:logistic') -nrounds = 4 +param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') +nrounds <- 4 # training the model for two rounds -bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) +bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) # Model accuracy without new features -accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) +accuracy.before <- (sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) + / length(agaricus.test$label)) # by default, we predict using all the trees - -pred_with_leaf = predict(bst, dtest, predleaf = TRUE) +pred_with_leaf <- predict(bst, dtest, predleaf = TRUE) head(pred_with_leaf) create.new.tree.features <- function(model, original.features){ pred_with_leaf <- predict(model, original.features, predleaf = TRUE) cols <- list() - for(i in 1:model$niter){ + for (i in 1:model$niter) { # max is not the real max but it s not important for the purpose of adding features - leaf.id <- sort(unique(pred_with_leaf[,i])) - cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id) + leaf.id <- sort(unique(pred_with_leaf[, i])) + cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id) } - cbind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols))) + cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols))) } # Convert previous features to one hot encoding @@ -47,7 +47,9 @@ watchlist <- list(train = new.dtrain) bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) # Model accuracy with new features -accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) +accuracy.after <- (sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) + / length(agaricus.test$label)) # Here the accuracy was already good and is now perfect. -cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n")) +cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", + accuracy.after, "!\n")) diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R index 0c1392ac8c97..e518ed3817e4 100644 --- a/R-package/demo/runall.R +++ b/R-package/demo/runall.R @@ -11,4 +11,4 @@ demo(early_stopping) demo(poisson_regression) demo(caret_wrapper) demo(tweedie_regression) -#demo(gpu_accelerated) # can only run when built with GPU support \ No newline at end of file +#demo(gpu_accelerated) # can only run when built with GPU support diff --git a/R-package/demo/tweedie_regression.R b/R-package/demo/tweedie_regression.R old mode 100755 new mode 100644 index 8b84ed555223..dfaf6a2ae2ce --- a/R-package/demo/tweedie_regression.R +++ b/R-package/demo/tweedie_regression.R @@ -8,7 +8,7 @@ data(AutoClaim) dt <- data.table(AutoClaim) # exclude these columns from the model matrix -exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY') +exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY') # retains the missing values # NOTE: this dataset is comes ready out of the box @@ -21,29 +21,29 @@ y <- dt[, CLM_AMT5] d_train <- xgb.DMatrix(data = x, label = y, missing = NA) -# the tweedie_variance_power parameter determines the shape of +# the tweedie_variance_power parameter determines the shape of # distribution # - closer to 1 is more poisson like and the mass -# is more concentrated near zero -# - closer to 2 is more gamma like and the mass spreads to the +# is more concentrated near zero +# - closer to 2 is more gamma like and the mass spreads to the # the right with less concentration near zero params <- list( objective = 'reg:tweedie', - eval_metric = 'rmse', + eval_metric = 'rmse', tweedie_variance_power = 1.4, max_depth = 6, eta = 1) bst <- xgb.train( - data = d_train, - params = params, + data = d_train, + params = params, maximize = FALSE, - watchlist = list(train = d_train), + watchlist = list(train = d_train), nrounds = 20) var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst) preds <- predict(bst, d_train) -rmse <- sqrt(sum(mean((y - preds)^2))) \ No newline at end of file +rmse <- sqrt(sum(mean((y - preds) ^ 2))) From 4cee55cda4f2999c0b558135344c67caec26e779 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 10:43:49 -0700 Subject: [PATCH 6/9] Run R demos --- .github/workflows/main.yml | 13 +++++++++---- R-package/demo/runall.R | 26 +++++++++++++------------- tests/ci_build/test_r_package.py | 8 ++++++++ 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index dada58c0ecfc..6642cc4e1df7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -6,6 +6,9 @@ name: XGBoost-CI # events but only for the master branch on: [push, pull_request] +env: + R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'stringi', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm') + # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: test-with-jvm: @@ -70,8 +73,9 @@ jobs: - name: Install dependencies shell: Rscript {0} run: | - install.packages(c('XML','igraph')) - install.packages(c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown')) + install.packages(${{ env.R_PACKAGES }}, + repos = 'http://cloud.r-project.org', + dependencies = c('Depends', 'Imports', 'LinkingTo')) - name: Test R run: | @@ -119,8 +123,9 @@ jobs: - name: Install dependencies shell: Rscript {0} run: | - install.packages(c('XML','igraph')) - install.packages(c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown')) + install.packages(${{ env.R_PACKAGES }}, + repos = 'http://cloud.r-project.org', + dependencies = c('Depends', 'Imports', 'LinkingTo')) - uses: actions/setup-python@v2 with: diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R index e518ed3817e4..0608bcb40bdd 100644 --- a/R-package/demo/runall.R +++ b/R-package/demo/runall.R @@ -1,14 +1,14 @@ # running all scripts in demo folder -demo(basic_walkthrough) -demo(custom_objective) -demo(boost_from_prediction) -demo(predict_first_ntree) -demo(generalized_linear_model) -demo(cross_validation) -demo(create_sparse_matrix) -demo(predict_leaf_indices) -demo(early_stopping) -demo(poisson_regression) -demo(caret_wrapper) -demo(tweedie_regression) -#demo(gpu_accelerated) # can only run when built with GPU support +demo(basic_walkthrough, package = 'xgboost') +demo(custom_objective, package = 'xgboost') +demo(boost_from_prediction, package = 'xgboost') +demo(predict_first_ntree, package = 'xgboost') +demo(generalized_linear_model, package = 'xgboost') +demo(cross_validation, package = 'xgboost') +demo(create_sparse_matrix, package = 'xgboost') +demo(predict_leaf_indices, package = 'xgboost') +demo(early_stopping, package = 'xgboost') +demo(poisson_regression, package = 'xgboost') +demo(caret_wrapper, package = 'xgboost') +demo(tweedie_regression, package = 'xgboost') +#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support diff --git a/tests/ci_build/test_r_package.py b/tests/ci_build/test_r_package.py index 0590152c0f65..582e76b07654 100644 --- a/tests/ci_build/test_r_package.py +++ b/tests/ci_build/test_r_package.py @@ -46,6 +46,10 @@ def test_with_autotools(args): 'R.exe', '-q', '-e', "library(testthat); setwd('tests'); source('testthat.R')" ]) + subprocess.check_call([ + 'R.exe', '-q', '-e', + "demo(runall, package = 'xgboost')" + ]) def test_with_cmake(args): @@ -79,6 +83,10 @@ def test_with_cmake(args): 'R.exe', '-q', '-e', "library(testthat); setwd('tests'); source('testthat.R')" ]) + subprocess.check_call([ + 'R.exe', '-q', '-e', + "demo(runall, package = 'xgboost')" + ]) def main(args): From 4dab38a738d5befbec2258a14837a41e1006f781 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 10:54:20 -0700 Subject: [PATCH 7/9] Fix CRAN checks --- R-package/DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index be9c62649f17..468098a083e6 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -54,7 +54,8 @@ Suggests: lintr, igraph (>= 1.0.1), jsonlite, - float + float, + crayon Depends: R (>= 3.3.0) Imports: From 6293d013b57d15489ae54b2139168e4925fd1a63 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 11:13:37 -0700 Subject: [PATCH 8/9] Install XGBoost into R env before running lintr --- .github/workflows/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6642cc4e1df7..c8325b21ed9d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,9 +77,10 @@ jobs: repos = 'http://cloud.r-project.org', dependencies = c('Depends', 'Imports', 'LinkingTo')) - - name: Test R + - name: Run lintr run: | cd R-package + R.exe CMD INSTALL . Rscript.exe tests/run_lint.R From 6297156ee3aab7e6497f57c52ab4b1cead0a07cb Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 26 Jul 2020 11:14:41 -0700 Subject: [PATCH 9/9] Install devtools (needed to run demos) --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c8325b21ed9d..9d4196feb754 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ name: XGBoost-CI on: [push, pull_request] env: - R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'stringi', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm') + R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'stringi', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools') # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: