[CI] Improve R linter script (#5944)
* [CI] Move lint to a separate script

* [CI] Improved lintr launcher

* Add lintr as a separate action

* Add custom parsing logic to print out logs

* Fix lintr issues in demos

* Run R demos

* Fix CRAN checks

* Install XGBoost into R env before running lintr

* Install devtools (needed to run demos)
hcho3 authored Jul 27, 2020
1 parent 8943eb4 commit 5879acd
Showing 22 changed files with 320 additions and 210 deletions.
51 changes: 49 additions & 2 deletions .github/workflows/main.yml
@@ -6,6 +6,9 @@ name: XGBoost-CI
 # events but only for the master branch
 on: [push, pull_request]
 
+env:
+  R_PACKAGES: c('XML', 'igraph', 'data.table', 'magrittr', 'stringi', 'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat', 'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools')
+
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   test-with-jvm:
@@ -38,6 +41,49 @@ jobs:
         mvn test -pl :xgboost4j_2.12
+  lintr:
+    runs-on: ${{ matrix.config.os }}
+
+    name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
+
+    strategy:
+      matrix:
+        config:
+          - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
+    env:
+      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      RSPM: ${{ matrix.config.rspm }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: 'true'
+
+      - uses: r-lib/actions/setup-r@master
+        with:
+          r-version: ${{ matrix.config.r }}
+
+      - name: Cache R packages
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('R-package/DESCRIPTION') }}
+          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-2-
+
+      - name: Install dependencies
+        shell: Rscript {0}
+        run: |
+          install.packages(${{ env.R_PACKAGES }},
+                           repos = 'http://cloud.r-project.org',
+                           dependencies = c('Depends', 'Imports', 'LinkingTo'))
+      - name: Run lintr
+        run: |
+          cd R-package
+          R.exe CMD INSTALL .
+          Rscript.exe tests/run_lint.R
   test-with-R:
     runs-on: ${{ matrix.config.os }}

@@ -78,8 +124,9 @@ jobs:
       - name: Install dependencies
         shell: Rscript {0}
         run: |
-          install.packages(c('XML','igraph'))
-          install.packages(c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown'))
+          install.packages(${{ env.R_PACKAGES }},
+                           repos = 'http://cloud.r-project.org',
+                           dependencies = c('Depends', 'Imports', 'LinkingTo'))
       - uses: actions/setup-python@v2
         with:
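Once GitHub Actions interpolates `${{ env.R_PACKAGES }}`, the two dependency steps above (in the `lintr` and `test-with-R` jobs) run the same effective R call, so the package list now lives in a single place:

```r
# Effective command after env interpolation; the package list comes from the
# R_PACKAGES variable defined at the top of main.yml.
install.packages(c('XML', 'igraph', 'data.table', 'magrittr', 'stringi',
                   'ggplot2', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'testthat',
                   'lintr', 'knitr', 'rmarkdown', 'e1071', 'cplm', 'devtools'),
                 repos = 'http://cloud.r-project.org',
                 dependencies = c('Depends', 'Imports', 'LinkingTo'))
```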
3 changes: 2 additions & 1 deletion R-package/DESCRIPTION
@@ -54,7 +54,8 @@ Suggests:
     lintr,
     igraph (>= 1.0.1),
     jsonlite,
-    float
+    float,
+    crayon
 Depends:
     R (>= 3.3.0)
 Imports:
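The `tests/run_lint.R` launcher invoked by the new CI job is not shown in this excerpt; `crayon` (added to Suggests above) is presumably what powers the "custom parsing logic to print out logs" mentioned in the commit message. A minimal sketch of what such a launcher could look like (hypothetical, not the actual script):

```r
# Hypothetical sketch of a lintr launcher like tests/run_lint.R.
# Assumes lintr and crayon are installed and the working directory is R-package/.
library(lintr)
library(crayon)

# Collect every R source file under the package directories.
files <- list.files(c("R", "demo", "tests"), pattern = "\\.[Rr]$",
                    recursive = TRUE, full.names = TRUE)

num_issues <- 0
for (f in files) {
  lints <- lintr::lint(f)  # honors a .lintr config file if one is present
  if (length(lints) > 0) {
    num_issues <- num_issues + length(lints)
    cat(crayon::bold(f), "\n")  # highlight the offending file in the CI log
    print(lints)
  }
}

if (num_issues > 0) {
  cat(crayon::red(sprintf("%d style issue(s) found.\n", num_issues)))
  quit(save = "no", status = 1)  # non-zero exit status fails the CI job
}
```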
2 changes: 1 addition & 1 deletion R-package/demo/README.md
@@ -17,4 +17,4 @@ Benchmarks
 Notes
 ====
 * Contribution of examples, benchmarks is more than welcomed!
-* If you like to share how you use xgboost to solve your problem, send a pull request:)
+* If you like to share how you use xgboost to solve your problem, send a pull request :)
34 changes: 17 additions & 17 deletions R-package/demo/basic_walkthrough.R
@@ -3,8 +3,8 @@ require(methods)
 
 # we load in the agaricus dataset
 # In this example, we are aiming to predict whether a mushroom is edible
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test
 # the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
@@ -26,7 +26,7 @@ bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2,
 # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
 print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2, 
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
                objective = "binary:logistic")
 
 # Verbose = 0,1,2
@@ -46,7 +46,7 @@ bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
 
 #--------------------basic prediction using xgboost--------------
 # you can do prediction using the following line
-# you can put in Matrix, sparseMatrix, or xgb.DMatrix 
+# you can put in Matrix, sparseMatrix, or xgb.DMatrix
 pred <- predict(bst, test$data)
 err <- mean(as.numeric(pred > 0.5) != test$label)
 print(paste("test-error=", err))
@@ -58,48 +58,48 @@ xgb.save(bst, "xgboost.model")
 bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)
 # pred2 should be identical to pred
-print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
 
 # save model to R's raw vector
-raw = xgb.save.raw(bst)
+raw <- xgb.save.raw(bst)
 # load binary model to R
 bst3 <- xgb.load(raw)
 pred3 <- predict(bst3, test$data)
 # pred3 should be identical to pred
-print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
+print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))
 
 #----------------Advanced features --------------
 # to use advanced features, we need to put data in xgb.DMatrix
-dtrain <- xgb.DMatrix(data = train$data, label=train$label)
-dtest <- xgb.DMatrix(data = test$data, label=test$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label)
+dtest <- xgb.DMatrix(data = test$data, label = test$label)
 #---------------Using watchlist----------------
 # watchlist is a list of xgb.DMatrix, each of them is tagged with name
-watchlist <- list(train=dtrain, test=dtest)
+watchlist <- list(train = dtrain, test = dtest)
 # to train with watchlist, use xgb.train, which contains more advanced features
-# watchlist allows us to monitor the evaluation result on all data in the list 
+# watchlist allows us to monitor the evaluation result on all data in the list
 print("Train xgboost using xgb.train with watchlist")
-bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
+bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
                  nthread = 2, objective = "binary:logistic")
 # we can change evaluation metrics, or use multiple evaluation metrics
 print("train xgboost using xgb.train with watchlist, watch logloss and error")
-bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
+bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
                  eval_metric = "error", eval_metric = "logloss",
                  nthread = 2, objective = "binary:logistic")
 
 # xgb.DMatrix can also be saved using xgb.DMatrix.save
 xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # to load it in, simply call xgb.DMatrix
 dtrain2 <- xgb.DMatrix("dtrain.buffer")
-bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
+bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
                  nthread = 2, objective = "binary:logistic")
 # information can be extracted from xgb.DMatrix using getinfo
-label = getinfo(dtest, "label")
+label <- getinfo(dtest, "label")
 pred <- predict(bst, dtest)
-err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
 print(paste("test-error=", err))
 
 # You can dump the tree you learned using xgb.dump into a text file
-dump_path = file.path(tempdir(), 'dump.raw.txt')
+dump_path <- file.path(tempdir(), 'dump.raw.txt')
 xgb.dump(bst, dump_path, with_stats = TRUE)
 
 # Finally, you can check which features are the most important.
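The tail of this demo is truncated above; computing feature importances in the xgboost R API is typically done along these lines (a sketch, not necessarily the demo's exact code):

```r
# Rank features by their contribution to the model, then plot the ranking.
importance_matrix <- xgb.importance(model = bst)
print(importance_matrix)
xgb.plot.importance(importance_matrix)
```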
10 changes: 5 additions & 5 deletions R-package/demo/boost_from_prediction.R
@@ -1,7 +1,7 @@
 require(xgboost)
 # load in the agaricus dataset
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 
@@ -11,12 +11,12 @@ watchlist <- list(eval = dtest, train = dtrain)
 #
 print('start running example to start from a initial prediction')
 # train xgboost for 1 round
-param <- list(max_depth=2, eta=1, nthread = 2, objective='binary:logistic')
+param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
 bst <- xgb.train(param, dtrain, 1, watchlist)
 # Note: we need the margin value instead of transformed prediction in set_base_margin
 # do predict with output_margin=TRUE, will always give you margin values before logistic transformation
-ptrain <- predict(bst, dtrain, outputmargin=TRUE)
-ptest <- predict(bst, dtest, outputmargin=TRUE)
+ptrain <- predict(bst, dtrain, outputmargin = TRUE)
+ptest <- predict(bst, dtest, outputmargin = TRUE)
 # set the base_margin property of dtrain and dtest
 # base margin is the base prediction we will boost from
 setinfo(dtrain, "base_margin", ptrain)
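As the comments in this demo note, `outputmargin = TRUE` returns raw scores before the logistic transformation. To recover probabilities from a margin you apply the sigmoid yourself, e.g.:

```r
# Map margin scores back to probabilities under the binary:logistic objective.
prob_test <- 1 / (1 + exp(-ptest))
```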
8 changes: 4 additions & 4 deletions R-package/demo/caret_wrapper.R
@@ -1,5 +1,5 @@
 # install development version of caret library that contains xgboost models
-devtools::install_github("topepo/caret/pkg/caret") 
+devtools::install_github("topepo/caret/pkg/caret")
 require(caret)
 require(xgboost)
 require(data.table)
@@ -13,13 +13,13 @@ df <- data.table(Arthritis, keep.rownames = FALSE)
 
 # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
 # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
-df[,AgeDiscret:= as.factor(round(Age/10,0))]
+df[, AgeDiscret := as.factor(round(Age / 10, 0))]
 
 # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
-df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
+df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
 
 # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
-df[,ID:=NULL]
+df[, ID := NULL]
 
 #-------------Basic Training using XGBoost in caret Library-----------------
 # Set up control parameters for caret::train
20 changes: 10 additions & 10 deletions R-package/demo/create_sparse_matrix.R
@@ -6,10 +6,10 @@ if (!require(vcd)) {
   require(vcd)
 }
 # According to its documentation, Xgboost works only on numbers.
-# Sometimes the dataset we have to work on have categorical data. 
+# Sometimes the dataset we have to work on have categorical data.
 # A categorical variable is one which have a fixed number of values. By example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
 #
-# In R, categorical variable is called Factor. 
+# In R, categorical variable is called Factor.
 # Type ?factor in console for more information.
 #
 # In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost.
@@ -32,17 +32,17 @@ str(df)
 # Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
 
 # For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
-df[,AgeDiscret:= as.factor(round(Age/10,0))]
+df[, AgeDiscret := as.factor(round(Age / 10, 0))]
 
 # Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
-df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
+df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
 
 # We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
-df[,ID:=NULL]
+df[, ID := NULL]
 
 # List the different values for the column Treatment: Placebo, Treated.
 cat("Values of the categorical feature Treatment\n")
-print(levels(df[,Treatment]))
+print(levels(df[, Treatment]))
 
 # Next step, we will transform the categorical data to dummy variables.
 # This method is also called one hot encoding.
@@ -52,16 +52,16 @@ print(levels(df[,Treatment]))
 #
 # Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values.
 # Column Improved is excluded because it will be our output column, the one we want to predict.
-sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
+sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
 
 cat("Encoding of the sparse Matrix\n")
 print(sparse_matrix)
 
 # Create the output vector (not sparse)
-# 1. Set, for all rows, field in Y column to 0; 
-# 2. set Y to 1 when Improved == Marked; 
+# 1. Set, for all rows, field in Y column to 0;
+# 2. set Y to 1 when Improved == Marked;
 # 3. Return Y column
-output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
+output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]
 
 # Following is the same process as other demo
 cat("Learning...\n")
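For readers less familiar with data.table's chained `[` syntax, the `output_vector` construction above yields the same 0/1 vector as this base-R sketch (minus the side effect of also adding a `Y` column to `df`):

```r
# Same target vector without data.table syntax.
output_vector <- as.integer(df$Improved == "Marked")
```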
18 changes: 9 additions & 9 deletions R-package/demo/cross_validation.R
@@ -1,25 +1,25 @@
 require(xgboost)
 # load in the agaricus dataset
-data(agaricus.train, package='xgboost')
-data(agaricus.test, package='xgboost')
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 
 nrounds <- 2
-param <- list(max_depth=2, eta=1, nthread=2, objective='binary:logistic')
+param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
 
 cat('running cross validation\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
-xgb.cv(param, dtrain, nrounds, nfold=5, metrics={'error'})
+xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = {'error'})
 
 cat('running cross validation, disable standard deviation display\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
-xgb.cv(param, dtrain, nrounds, nfold=5,
-       metrics='error', showsd = FALSE)
+xgb.cv(param, dtrain, nrounds, nfold = 5,
+       metrics = 'error', showsd = FALSE)
 
 ###
 # you can also do cross validation with cutomized loss function
@@ -29,18 +29,18 @@ print ('running cross validation, with cutomsized loss function')
 
 logregobj <- function(preds, dtrain) {
   labels <- getinfo(dtrain, "label")
-  preds <- 1/(1 + exp(-preds))
+  preds <- 1 / (1 + exp(-preds))
   grad <- preds - labels
   hess <- preds * (1 - preds)
   return(list(grad = grad, hess = hess))
 }
 evalerror <- function(preds, dtrain) {
   labels <- getinfo(dtrain, "label")
-  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
   return(list(metric = "error", value = err))
 }
 
-param <- list(max_depth=2, eta=1,
+param <- list(max_depth = 2, eta = 1,
               objective = logregobj, eval_metric = evalerror)
 # train with customized objective
 xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5)
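For reference, the gradient and Hessian returned by `logregobj` above follow directly from the log loss under the sigmoid link. With margin m and p = sigma(m):

```latex
L(y, m) = -\bigl[\, y \log p + (1 - y) \log(1 - p) \,\bigr],
  \qquad p = \sigma(m) = \frac{1}{1 + e^{-m}}

\frac{\partial L}{\partial m} = p - y,
  \qquad \frac{\partial^2 L}{\partial m^2} = p\,(1 - p)
```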
