using XGBoost
const DATAPATH = joinpath(@__DIR__, "..", "data")
# we load the agaricus dataset
# in this example we aim to predict whether a mushroom is edible
# read a LIBSVM-format text file into a dense Float32 matrix plus label vector
function readlibsvm(fname::String, shape)
    dmx = zeros(Float32, shape)
    label = Float32[]
    fi = open(fname, "r")
    cnt = 1
    for line in eachline(fi)
        line = split(line, " ")
        push!(label, parse(Float32, line[1]))   # first field is the label
        line = line[2:end]
        for itm in line                         # remaining fields are index:value pairs
            itm = split(itm, ":")
            # shift the 0-based feature index to Julia's 1-based columns
            dmx[cnt, parse(Int, itm[1]) + 1] = parse(Float32, itm[2])
        end
        cnt += 1
    end
    close(fi)
    return (dmx, label)
end
# we use an auxiliary function to read LIBSVM-format data into a Julia matrix
train_X, train_Y = readlibsvm(joinpath(DATAPATH, "agaricus.txt.train"), (6513, 126))
test_X, test_Y = readlibsvm(joinpath(DATAPATH, "agaricus.txt.test"), (1611, 126))
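# quick sanity check that what we read matches the shapes hard-coded in the
# calls above (the dataset sizes are taken from those calls)
@assert size(train_X) == (6513, 126) && size(test_X) == (1611, 126)
@assert length(train_Y) == 6513 && length(test_Y) == 1611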
#-------------Basic Training using XGBoost-----------------
# note: xgboost handles sparse input natively
# use a sparse matrix when your features are sparse (e.g. one-hot encoded
# vectors); a small one-hot sketch follows the sparse example below
# model parameters can be passed as keyword arguments to the `xgboost` function,
# or collected in a Vector of pairs / a Dict
num_round = 2
print("training xgboost with dense matrix\n")
# you can pass Julia's dense or sparse matrix directly as data,
# by calling xgboost(data, num_round, label = label, <training parameters>)
bst = xgboost(train_X, num_round, label = train_Y, eta = 1, max_depth = 2,
objective = "binary:logistic")
print("training xgboost with sparse matrix\n")
using SparseArrays: sparse
sptrain = sparse(train_X)
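# as an aside, a minimal sketch of building a one-hot matrix directly in
# sparse form (the categorical values below are made up for illustration)
cats = [1, 3, 2, 3, 1]   # hypothetical categorical feature with levels 1:3
onehot = sparse(1:length(cats), cats, ones(Float32, length(cats)), length(cats), 3)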
# alternatively, you can pass the parameters as a collection of pairs
# (a Dict works too; see the sketch below)
param = ["max_depth" => 2,
"eta" => 1,
"objective" => "binary:logistic"]
bst = xgboost(sptrain, num_round, label = train_Y, param = param)
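# the same parameters as a Dict, the other form mentioned above; a sketch
# assuming the classic XGBoost.jl API accepts a Dict for param as well
param_dict = Dict("max_depth" => 2, "eta" => 1, "objective" => "binary:logistic")
bst = xgboost(sptrain, num_round, label = train_Y, param = param_dict)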
# you can also pass in xgboost's DMatrix object
# a DMatrix stores labels, data and other metadata needed for advanced features
print("training xgboost with DMatrix\n")
dtrain = DMatrix(train_X, label = train_Y)
bst = xgboost(dtrain, num_round, eta = 1, objective = "binary:logistic")
# you can also specify the data as a file path to a LIBSVM-format input
bst = xgboost(joinpath(DATAPATH, "agaricus.txt.train"), num_round, max_depth = 2, eta = 1,
objective = "binary:logistic")
#--------------------basic prediction using XGBoost--------------
# you can run prediction with the following line
# the input can be a Matrix, a SparseMatrixCSC or a DMatrix
preds = predict(bst, test_X)
print("test-error=", sum((preds .> 0.5) .!= test_Y) / float(size(preds)[1]), "\n")
#-------------------save and load models-------------------------
# save the model to a local binary file
save(bst, "xgb.model")
# load the binary model back into Julia
bst2 = Booster(model_file = "xgb.model")
preds2 = predict(bst2, test_X)
print("sum(abs(pred2-pred))=", sum(abs, preds2 .- preds), "\n")
#----------------Advanced features --------------
# to use the advanced features, we need to put the data into a DMatrix
dtrain = DMatrix(train_X, label = train_Y)
dtest = DMatrix(test_X, label = test_Y)
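# a DMatrix can also carry optional per-row metadata such as instance weights;
# a sketch assuming your version provides set_info (the counterpart of the
# get_info call used below); the uniform weights here are purely illustrative
set_info(dtrain, "weight", ones(Float32, size(train_X, 1)))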
#---------------Using watchlist----------------
# a watchlist is a list of DMatrix objects, each tagged with a name
# every DMatrix in the watchlist should carry labels (for evaluation)
watchlist = [(dtest,"eval"), (dtrain,"train")]
# we can change the evaluation metric, or use multiple metrics at once
# (a custom-metric sketch follows the call below)
bst = xgboost(dtrain, num_round, param = param, watchlist = watchlist,
metrics = ["logloss", "error"])
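# besides the built-in metrics, the classic XGBoost.jl API also accepts a
# custom evaluation function via the feval keyword; a sketch following the
# repo's custom-objective demo (the (preds, dmat) signature is assumed from
# that demo and may differ in newer versions)
function evalerror(preds::Vector{Float32}, dmat::DMatrix)
    labels = get_info(dmat, "label")
    # return (metric name, metric value) for xgboost to report each round
    return ("custom-error", sum((preds .> 0.5) .!= labels) / length(labels))
end
bst = xgboost(dtrain, num_round, param = param, watchlist = watchlist, feval = evalerror)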
# we can also save a DMatrix to a binary file, so it loads faster next time
save(dtest, "dtest.buffer")
save(dtrain, "dtrain.buffer")
# load model and data
dtrain = DMatrix("dtrain.buffer")
dtest = DMatrix("dtest.buffer")
bst = Booster(model_file = "xgb.model")
# information can be extracted from a DMatrix using get_info
label = get_info(dtest, "label")
pred = predict(bst, dtest)
print("test-error=", sum((pred .> 0.5) .!= label) / float(size(pred)[1]), "\n")
# finally, you can dump the trees you learned into a text file using dump_model
dump_model(bst, "dump.raw.txt")
# if you have a feature map file, you can dump the model in a more readable way
dump_model(bst, "dump.nice.txt", fmap = joinpath(DATAPATH, "featmap.txt"))