-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
lgb.Dataset.R
1246 lines (1044 loc) · 38 KB
/
lgb.Dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#' group rows together as ordered results from the same set of candidate results
#' to be ranked. For example, if you have a 100-document dataset with
#' \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#' where the first 10 records are in the first group, records 11-30 are in the
#' second group, etc.
#' @keywords internal
NULL
# [description] Names accepted as "info" fields of an lgb.Dataset
#               (the values allowed for field_name in get_field() / set_field()).
#               Exposed through a function instead of a plain constant so the value is
#               only looked up when it is needed, which makes the order in which
#               R sources the package files during installation irrelevant.
# [return] A character vector of field names.
.INFO_KEYS <- function() {
  valid_keys <- c("label", "weight", "init_score", "group")
  return(valid_keys)
}
#' @importFrom methods is
#' @importFrom R6 R6Class
#' @importFrom utils modifyList
Dataset <- R6::R6Class(
classname = "lgb.Dataset",
cloneable = FALSE,
public = list(
# [description] Hand the current handle to LGBM_DatasetFree_R and then forget it on
#               the R side, so the Dataset counts as "not constructed" afterwards.
# [return] NULL, invisibly
finalize = function() {
  .Call(LGBM_DatasetFree_R, private$handle)
  # a NULL handle is what the other methods test for to detect an unconstructed Dataset
  private$handle <- NULL
  return(invisible(NULL))
},
# [description] Record everything needed to build the Dataset later. Nothing is created
#               on the C++ side here; that happens in construct().
# [return] NULL, invisibly
initialize = function(data,
                      params = list(),
                      reference = NULL,
                      colnames = NULL,
                      categorical_feature = NULL,
                      predictor = NULL,
                      free_raw_data = TRUE,
                      used_indices = NULL,
                      label = NULL,
                      weight = NULL,
                      group = NULL,
                      init_score = NULL) {

  # fail fast on arguments of the wrong class, before doing any other work
  if (!is.null(reference) && !lgb.is.Dataset(reference)) {
    stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
  }
  if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
    stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
  }

  # keep only the info fields that were actually supplied
  supplied_fields <- list(
    label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )
  info <- list()
  for (field in names(supplied_fields)) {
    if (!is.null(supplied_fields[[field]])) {
      info[[field]] <- supplied_fields[[field]]
    }
  }

  # dense matrices are stored as "double", whatever type they arrived as
  if (is.matrix(data) && storage.mode(data) != "double") {
    storage.mode(data) <- "double"
  }

  # remember the inputs until construct() is called
  private$raw_data <- data
  private$params <- params
  private$reference <- reference
  private$colnames <- colnames
  private$categorical_feature <- categorical_feature
  private$predictor <- predictor
  private$free_raw_data <- free_raw_data
  private$used_indices <- sort(used_indices, decreasing = FALSE)
  private$info <- info
  private$version <- 0L

  return(invisible(NULL))
},
# [description] Create a new (unconstructed) Dataset that uses this Dataset as its
#               reference and inherits its column names, categorical features,
#               predictor and free_raw_data setting.
# [return] the new Dataset, invisibly
create_valid = function(data,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        params = list()) {

  # start from this Dataset's parameters; anything passed to this call wins
  valid_params <- modifyList(private$params, params)

  valid_set <- Dataset$new(
    data = data
    , params = valid_params
    , reference = self
    , colnames = private$colnames
    , categorical_feature = private$categorical_feature
    , predictor = private$predictor
    , free_raw_data = private$free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(valid_set))
},
# [description] Build the Dataset on the C++ side, unless that has already happened.
#               Depending on what was given to initialize(), the Dataset is created from
#               a file path, a dense matrix, a dgCMatrix, or as a row subset of the
#               reference Dataset (when used_indices is set). Afterwards feature names
#               and info fields (label, weight, ...) are pushed to the new handle.
# [return] self, invisibly
construct = function() {

  # A non-null handle means the Dataset has already been constructed
  if (!lgb.is.null.handle(x = private$handle)) {
    return(invisible(self))
  }

  # Only in-memory data (dense matrix or dgCMatrix) exposes its column names and
  # number of columns on the R side
  data_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

  # Get feature names
  cnames <- NULL
  if (data_has_dims) {
    cnames <- colnames(private$raw_data)
  }

  # set feature names if they do not exist
  if (is.null(private$colnames) && !is.null(cnames)) {
    private$colnames <- as.character(cnames)
  }

  # Get categorical feature index
  if (!is.null(private$categorical_feature)) {

    if (is.character(private$categorical_feature)) {

      # Features given by name: look up their positions among the feature names
      matched <- match(private$categorical_feature, private$colnames)

      # Names that could not be found are reported together, comma-separated
      if (anyNA(matched)) {
        stop(
          "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
          , paste0(sQuote(private$categorical_feature[is.na(matched)]), collapse = ", ")
        )
      }

      # Store indices as [0, n-1] indexed instead of [1, n] indexed
      cate_indices <- as.list(matched - 1L)

    } else {

      # Work out how many features there are, if that is knowable on the R side.
      # The column count of in-memory data is authoritative; otherwise fall back to
      # the known feature names. When neither is available (e.g. data is a file path
      # and no colnames were given) the range check is skipped instead of comparing
      # against a feature count of 0.
      num_features <- NULL
      if (data_has_dims) {
        num_features <- ncol(private$raw_data)
      } else if (!is.null(private$colnames)) {
        num_features <- length(private$colnames)
      }

      # Check if more categorical features were output over the feature space
      if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
        stop(
          "lgb.self.get.handle: supplied a too large value in categorical_feature: "
          , max(private$categorical_feature)
          , " but only "
          , num_features
          , " features"
        )
      }

      # Store indices as [0, n-1] indexed instead of [1, n] indexed
      cate_indices <- as.list(private$categorical_feature - 1L)

    }

    # Store indices for categorical features
    private$params$categorical_feature <- cate_indices

  }

  # Generate parameter str
  params_str <- lgb.params2str(params = private$params)

  # Get handle of reference dataset (get_handle() constructs the reference if needed)
  ref_handle <- NULL
  if (!is.null(private$reference)) {
    ref_handle <- private$reference$.__enclos_env__$private$get_handle()
  }

  # not subsetting, constructing from raw data
  if (is.null(private$used_indices)) {

    if (is.null(private$raw_data)) {
      stop(paste0(
        "Attempting to create a Dataset without any raw data. "
        , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
        , "To avoid this error in the future, use lgb.Dataset.save() or "
        , "Dataset$save_binary() to save lightgbm Datasets."
      ))
    }

    # Are we using a data file?
    if (is.character(private$raw_data)) {

      handle <- .Call(
        LGBM_DatasetCreateFromFile_R
        , path.expand(private$raw_data)
        , params_str
        , ref_handle
      )

    } else if (is.matrix(private$raw_data)) {

      # Are we using a matrix?
      handle <- .Call(
        LGBM_DatasetCreateFromMat_R
        , private$raw_data
        , nrow(private$raw_data)
        , ncol(private$raw_data)
        , params_str
        , ref_handle
      )

    } else if (methods::is(private$raw_data, "dgCMatrix")) {

      if (length(private$raw_data@p) > 2147483647L) {
        stop("Cannot support large CSC matrix")
      }

      # Are we using a dgCMatrix (sparse matrix column compressed)
      handle <- .Call(
        LGBM_DatasetCreateFromCSC_R
        , private$raw_data@p
        , private$raw_data@i
        , private$raw_data@x
        , length(private$raw_data@p)
        , length(private$raw_data@x)
        , nrow(private$raw_data)
        , params_str
        , ref_handle
      )

    } else {

      # Unknown data type
      stop(
        "lgb.Dataset.construct: does not support constructing from "
        , sQuote(class(private$raw_data))
      )

    }

  } else {

    # Subsetting only makes sense relative to a reference Dataset
    if (is.null(private$reference)) {
      stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
    }

    # Construct subset
    handle <- .Call(
      LGBM_DatasetGetSubset_R
      , ref_handle
      , c(private$used_indices) # Adding c() fixes issue in R v3.5
      , length(private$used_indices)
      , params_str
    )

  }

  if (lgb.is.null.handle(x = handle)) {
    stop("lgb.Dataset.construct: cannot create Dataset handle")
  }

  # Setup class and private type
  class(handle) <- "lgb.Dataset.handle"
  private$handle <- handle

  # Set feature names
  if (!is.null(private$colnames)) {
    self$set_colnames(colnames = private$colnames)
  }

  # Load init score if requested (only when constructing from raw data, not a subset)
  if (!is.null(private$predictor) && is.null(private$used_indices)) {

    # Setup initial scores
    init_score <- private$predictor$predict(
      data = private$raw_data
      , rawscore = TRUE
    )

    # No need to transpose, since it is column-major
    init_score <- as.vector(init_score)
    private$info$init_score <- init_score

  }

  # Should we free raw data?
  if (isTRUE(private$free_raw_data)) {
    private$raw_data <- NULL
  }

  # Push every stored info field (label, weight, ...) to the newly created handle
  if (length(private$info) > 0L) {

    for (i in seq_along(private$info)) {

      # single-bracket subsetting keeps the name alongside the value
      p <- private$info[i]
      self$set_field(
        field_name = names(p)
        , data = p[[1L]]
      )

    }

  }

  # A constructed Dataset must have a label
  if (is.null(self$get_field(field_name = "label"))) {
    stop("lgb.Dataset.construct: label should be set")
  }

  return(invisible(self))

},
# [description] Get the dimensions of the Dataset. For a constructed Dataset they are
#               read from the handle; otherwise they are taken from the raw data, which
#               only works for a dense matrix or a dgCMatrix.
# [return] an integer vector: c(number of rows, number of features)
dim = function() {

  # Constructed Dataset: ask the C++ side
  if (!lgb.is.null.handle(x = private$handle)) {

    # The results of the two calls below are read back from these variables and not
    # from the value of .Call(), i.e. the buffers are filled in place. Allocate them
    # with integer(1L) (as get_feature_num_bin() does) so that in-place write never
    # lands on a literal constant.
    num_row <- integer(1L)
    num_col <- integer(1L)

    .Call(
      LGBM_DatasetGetNumData_R
      , private$handle
      , num_row
    )
    .Call(
      LGBM_DatasetGetNumFeature_R
      , private$handle
      , num_col
    )
    return(
      c(num_row, num_col)
    )

  }

  # Not constructed, but in-memory data is available
  # NOTE: handling a dgCMatrix requires the Matrix package
  if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
    return(dim(private$raw_data))
  }

  # Trying to work with unknown dimensions is not possible
  stop(
    "dim: cannot get dimensions before dataset has been constructed, "
    , "please call lgb.Dataset.construct explicitly"
  )

},
# [description] Get the number of bins of one feature of a constructed Dataset.
# [param] feature 1-based index of the feature (converted to 0-based for the C++ side)
# [return] an integer of length 1
get_feature_num_bin = function(feature) {
  not_constructed <- lgb.is.null.handle(x = private$handle)
  if (not_constructed) {
    stop("Cannot get number of bins in feature before constructing Dataset.")
  }
  # freshly allocated buffer that is passed to the call and then returned
  bin_count <- integer(1L)
  .Call(LGBM_DatasetGetFeatureNumBin_R, private$handle, feature - 1L, bin_count)
  return(bin_count)
},
# [description] Get the feature names. For a constructed Dataset they are fetched from
#               the handle (and cached in private$colnames); otherwise they are the
#               column names of the raw data when it is a matrix or dgCMatrix.
# [return] the feature names
get_colnames = function() {

  # constructed: the C++ side is the source of truth, refresh the R-side copy
  if (!lgb.is.null.handle(x = private$handle)) {
    feature_names <- .Call(LGBM_DatasetGetFeatureNames_R, private$handle)
    private$colnames <- feature_names
    return(private$colnames)
  }

  # not constructed, but in-memory data (dense or sparse column-compressed) is available
  has_matrix_data <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
  if (has_matrix_data) {
    return(colnames(private$raw_data))
  }

  # anything else (e.g. a file path) has no names until it has been constructed
  stop(
    "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
    , "lgb.Dataset.construct() explicitly"
  )

},
# [description] Set the feature names. They are always stored on the R side and, when
#               the Dataset is already constructed, also sent to the handle.
#               NULL or zero-length input is ignored.
# [return] self, invisibly
set_colnames = function(colnames) {

  # as.character(NULL) has length 0, so this one check covers NULL and empty input
  new_names <- as.character(colnames)
  if (length(new_names) == 0L) {
    return(invisible(self))
  }

  private$colnames <- new_names

  if (!lgb.is.null.handle(x = private$handle)) {
    # the names are handed over as one tab-separated string
    .Call(
      LGBM_DatasetSetFeatureNames_R
      , private$handle
      , paste0(new_names, collapse = "\t")
    )
  }

  return(invisible(self))

},
# [description] Get one of the info fields (see .INFO_KEYS()) of the Dataset. A value
#               already stored on the R side is returned directly; otherwise it is
#               fetched from the constructed Dataset and cached in private$info.
# [param] field_name a single string, one of .INFO_KEYS()
# [return] the field's values (integer for "group", numeric otherwise), or NULL if
#          the field has no values
get_field = function(field_name) {

  # Check if attribute key is in the known attribute list
  if (!is.character(field_name) || length(field_name) != 1L || !field_name %in% .INFO_KEYS()) {
    stop(
      "Dataset$get_field(): field_name must one of the following: "
      , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
    )
  }

  # Nothing cached on the R side: the value has to come from the handle
  if (is.null(private$info[[field_name]])) {

    if (lgb.is.null.handle(x = private$handle)) {
      stop("Cannot perform Dataset$get_field() before constructing Dataset.")
    }

    # The size is read back from this variable after the call, i.e. it is filled in
    # place. Allocate it with integer(1L) (as get_feature_num_bin() does) so that
    # in-place write never lands on a literal constant.
    info_len <- integer(1L)
    .Call(
      LGBM_DatasetGetFieldSize_R
      , private$handle
      , field_name
      , info_len
    )

    if (info_len > 0L) {

      # Buffer of the right type and size for the values: "group" is stored as
      # integer, every other field as numeric
      ret <- if (field_name == "group") {
        integer(info_len)
      } else {
        numeric(info_len)
      }

      .Call(
        LGBM_DatasetGetField_R
        , private$handle
        , field_name
        , ret
      )

      private$info[[field_name]] <- ret

    }
  }

  return(private$info[[field_name]])

},
# [description] Set one of the info fields (see .INFO_KEYS()) of the Dataset. The value
#               is always stored on the R side; if the Dataset is constructed and the
#               value is non-empty it is also sent to the handle, and the Dataset's
#               version counter is incremented.
# [param] field_name a single string, one of .INFO_KEYS()
# [param] data the values; coerced to integer for "group" and to numeric otherwise
# [return] self, invisibly
set_field = function(field_name, data) {

  field_is_valid <- is.character(field_name) && length(field_name) == 1L && field_name %in% .INFO_KEYS()
  if (!field_is_valid) {
    stop(
      "Dataset$set_field(): field_name must one of the following: "
      , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
    )
  }

  # pick the coercion that matches the field's storage type
  coerce <- if (field_name == "group") as.integer else as.numeric
  data <- coerce(data)

  # keep the R-side copy up to date
  private$info[[field_name]] <- data

  is_constructed <- !lgb.is.null.handle(x = private$handle)
  if (is_constructed && !is.null(data) && length(data) > 0L) {
    .Call(
      LGBM_DatasetSetField_R
      , private$handle
      , field_name
      , data
      , length(data)
    )
    private$version <- private$version + 1L
  }

  return(invisible(self))

},
# [description] Create a new (unconstructed) Dataset that is a row subset of this one.
#               The new Dataset carries no raw data of its own; it references this
#               Dataset and the sorted row indices to use.
# [param] idxset indices of the rows to keep
# [return] the new Dataset
slice = function(idxset) {
  subset_dataset <- Dataset$new(
    data = NULL
    , params = private$params
    , reference = self
    , colnames = private$colnames
    , categorical_feature = private$categorical_feature
    , predictor = private$predictor
    , free_raw_data = private$free_raw_data
    , used_indices = sort(idxset, decreasing = FALSE)
  )
  return(subset_dataset)
},
# [description] Merge new parameters into the Dataset's parameters.
#               - not constructed yet: only private$params is updated
#               - constructed: the old and new parameter strings are passed to
#                 LGBM_DatasetUpdateParamChecking_R; if that call raises an error and
#                 the raw data is still available, the new parameters are kept and the
#                 Dataset is finalized ("deconstructed"); without raw data the error
#                 is re-raised
# [return] self, invisibly
update_params = function(params) {

  # nothing to merge
  if (length(params) == 0L) {
    return(invisible(self))
  }

  merged_params <- utils::modifyList(private$params, params)

  # not constructed: this is purely an R-side change
  if (lgb.is.null.handle(x = private$handle)) {
    private$params <- merged_params
    return(invisible(self))
  }

  tryCatch({
    .Call(
      LGBM_DatasetUpdateParamChecking_R
      , lgb.params2str(params = private$params)
      , lgb.params2str(params = merged_params)
    )
    private$params <- merged_params
  }, error = function(e) {
    # without the raw data there is no way to honour the request, so pass the error on
    if (is.null(private$raw_data)) {
      stop(e)
    }
    # with the raw data at hand, keep the new parameters on the R side and
    # reset the Dataset
    private$params <- merged_params
    self$finalize()
  })

  return(invisible(self))

},
# [description] Return the subset of this Dataset's parameters whose names are listed
#               in .DATASET_PARAMETERS(). Mainly meant for Booster, which updates its
#               own parameters from a Dataset. Other methods of this class should not
#               rely on it: "verbose" is not a Dataset parameter and has to be passed
#               through to avoid globally re-setting verbosity.
# [return] a named list of parameters
get_params = function() {
  known_keys <- unname(unlist(.DATASET_PARAMETERS()))
  dataset_only <- list()
  for (key in names(private$params)) {
    # skip everything that is not a Dataset parameter
    if (!(key %in% known_keys)) {
      next
    }
    dataset_only[[key]] <- private$params[[key]]
  }
  return(dataset_only)
},
# [description] Change which features are treated as categorical. A real change
#               finalizes the Dataset, so the raw data must still be available.
# [return] self, invisibly
set_categorical_feature = function(categorical_feature) {

  is_unchanged <- identical(private$categorical_feature, categorical_feature)

  if (!is_unchanged) {

    # the Dataset is finalized below, which is only allowed while raw data is present
    # (the message is a single string literal that spans two source lines)
    if (is.null(private$raw_data)) {
      stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
    }

    private$categorical_feature <- categorical_feature
    self$finalize()

  }

  return(invisible(self))

},
# [description] Make another Dataset the reference of this one. The categorical
#               features, column names and predictor of the reference are copied over
#               and this Dataset is finalized.
# [return] self, invisibly
set_reference = function(reference) {

  # same reference as before: nothing to change
  if (identical(private$reference, reference)) {
    return(invisible(self))
  }

  # a new reference removes the Dataset object on the C++ side, which is only
  # acceptable while the raw data needed to reconstruct it is still around
  # (the message is a single string literal that spans two source lines)
  if (is.null(private$raw_data)) {
    stop("set_reference: cannot set reference after freeing raw data,
please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
  }

  if (!lgb.is.Dataset(reference)) {
    stop("set_reference: Can only use lgb.Dataset as a reference")
  }

  # take over the settings of the reference
  reference_private <- reference$.__enclos_env__$private
  self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
  self$set_colnames(colnames = reference$get_colnames())
  private$set_predictor(predictor = reference_private$predictor)

  private$reference <- reference

  self$finalize()
  return(invisible(self))

},
# [description] Construct the Dataset (if necessary) and pass its handle and the
#               expanded file path to LGBM_DatasetSaveBinary_R.
# [param] fname path of the file to write; a leading "~" is expanded
# [return] self, invisibly
save_binary = function(fname) {
  # make sure there is a handle to save
  self$construct()
  .Call(LGBM_DatasetSaveBinary_R, private$handle, path.expand(fname))
  return(invisible(self))
}
),
private = list(
handle = NULL,
raw_data = NULL,
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
predictor = NULL,
free_raw_data = TRUE,
used_indices = NULL,
info = NULL,
version = 0L,
# [description] Return the handle of this Dataset, constructing the Dataset first
#               when it does not have one yet.
get_handle = function() {
  needs_construction <- lgb.is.null.handle(x = private$handle)
  if (needs_construction) {
    self$construct()
  }
  return(private$handle)
},
# [description] Change the predictor attached to this Dataset (NULL removes it).
#               A real change finalizes the Dataset, so the raw data must still
#               be available.
# [return] self, invisibly
set_predictor = function(predictor) {

  is_unchanged <- identical(private$predictor, predictor)

  if (!is_unchanged) {

    # the Dataset is finalized below, which is only allowed while raw data is present
    # (the message is a single string literal that spans two source lines)
    if (is.null(private$raw_data)) {
      stop("set_predictor: cannot set predictor after free raw data,
please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
    }

    # NULL is accepted; anything else has to be an lgb.Predictor
    if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
      stop("set_predictor: Can only use lgb.Predictor as predictor")
    }

    private$predictor <- predictor
    self$finalize()

  }

  return(invisible(self))

}
)
)
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#' or local file (that was created previously by saving an \code{lgb.Dataset}).
#' @inheritParams lgb_shared_dataset_params
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#' a character representing a path to a text file (CSV, TSV, or LibSVM),
#' or a character representing a path to a binary \code{lgb.Dataset} file
#' @param params a list of parameters. See
#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#' The "Dataset Parameters" section of the documentation} for a list of parameters
#' and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#' continuous features into histograms. If you want to apply the same bin boundaries from an existing
#' dataset to new \code{data}, pass that existing Dataset to this argument.
#' @param colnames names of columns
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#' By default, that Dataset object on the R side does not keep a copy of the raw data.
#' This reduces LightGBM's memory consumption, but it means that the Dataset object
#' cannot be changed after it has been constructed. If you'd prefer to be able to
#' change the Dataset object after construction, set \code{free_raw_data = FALSE}.
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.construct(dtrain)
#' }
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL) {

  # predictor and used_indices are internal settings that are not exposed here,
  # so a Dataset created through this function always starts without them
  dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(dataset))

}
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
#' @inheritParams lgb_shared_dataset_params
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#' a character representing a path to a text file (CSV, TSV, or LibSVM),
#' or a character representing a path to a binary \code{Dataset} file
#' @param params a list of parameters. See
#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#' The "Dataset Parameters" section of the documentation} for a list of parameters
#' and valid values. If this is an empty list (the default), the validation Dataset
#' will have the same parameters as the Dataset passed to argument \code{dataset}.
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#' data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#' , file = train_file
#' , sep = ","
#' , col.names = TRUE
#' , row.names = FALSE
#' , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#' data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#' , file = valid_file
#' , sep = ","
#' , col.names = FALSE
#' , row.names = FALSE
#' , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#' data = train_file
#' , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#' data = valid_file
#' , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
#' }
#' @export
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list()) {

  # the actual work is done by a method of the training Dataset,
  # so anything else cannot be accepted
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  valid_set <- dataset$create_valid(
    data = data
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = params
  )

  return(invisible(valid_set))

}
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
#' @param dataset Object of class \code{lgb.Dataset}
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' }
#' @return constructed dataset
#' @export
lgb.Dataset.construct <- function(dataset) {
  # construction is a method of the Dataset object, so nothing else is accepted
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }
  constructed <- dataset$construct()
  return(invisible(constructed))
}
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
#' @param x Object of class \code{lgb.Dataset}
#'
#' @return a vector of numbers of rows and of columns
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#' }
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x) {
  # delegate to the Dataset's own dim() method
  if (lgb.is.Dataset(x = x)) {
    return(x$dim())
  }
  stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
}
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#' row names would have no effect and returned row names would be NULL.
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
#' and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
#' print(dtrain, verbose = TRUE)
#' }
#' @rdname dimnames.lgb.Dataset
#' @return A list with the dimension names of the dataset
#' @export
dimnames.lgb.Dataset <- function(x) {
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }
  # an lgb.Dataset has no row names, so the first element is always NULL
  column_names <- x$get_colnames()
  return(list(NULL, column_names))
}
#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {
# Check if invalid element list
if (!identical(class(value), "list") || length(value) != 2L) {
stop("invalid ", sQuote("value"), " given: must be a list of two elements")
}
# Check for unknown row names
if (!is.null(value[[1L]])) {
stop("lgb.Dataset does not have rownames")
}