-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Export Python Interface for external memory. (#7070)
* Add Python iterator interface. * Add tests. * Add demo. * Add documents. * Handle empty dataset.
- Loading branch information
1 parent
e64ee65
commit e608836
Showing
34 changed files
with
961 additions
and
200 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,17 @@ | ||
cmake_minimum_required(VERSION 3.13) | ||
project(api-demo LANGUAGES C VERSION 0.0.1) | ||
find_package(xgboost REQUIRED) | ||
project(xgboost-c-examples) | ||
|
||
# xgboost is built as static libraries, all cxx dependencies need to be linked into the | ||
# executable. | ||
if (XGBOOST_BUILD_STATIC_LIB) | ||
enable_language(CXX) | ||
# find again for those cxx libraries. | ||
find_package(xgboost REQUIRED) | ||
endif(XGBOOST_BUILD_STATIC_LIB) | ||
add_subdirectory(basic) | ||
add_subdirectory(external-memory) | ||
|
||
add_executable(api-demo c-api-demo.c) | ||
target_link_libraries(api-demo PRIVATE xgboost::xgboost) | ||
enable_testing() | ||
add_test( | ||
NAME test_xgboost_demo_c_basic | ||
COMMAND api-demo | ||
WORKING_DIRECTORY ${xgboost-c-examples_BINARY_DIR} | ||
) | ||
add_test( | ||
NAME test_xgboost_demo_c_external_memory | ||
COMMAND external-memory-demo | ||
WORKING_DIRECTORY ${xgboost-c-examples_BINARY_DIR} | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Build script for the basic C API demo: declares the `api-demo` C project, locates
# the xgboost package and links the demo executable against `xgboost::xgboost`.
project(api-demo LANGUAGES C VERSION 0.0.1) | ||
find_package(xgboost REQUIRED) | ||
|
||
# xgboost is built as static libraries, all cxx dependencies need to be linked into the | ||
# executable. | ||
# The CXX language is enabled first, then the package is searched for a second time.
if (XGBOOST_BUILD_STATIC_LIB) | ||
enable_language(CXX) | ||
# find again for those cxx libraries. | ||
find_package(xgboost REQUIRED) | ||
endif(XGBOOST_BUILD_STATIC_LIB) | ||
|
||
# The demo is a single C translation unit.
add_executable(api-demo c-api-demo.c) | ||
target_link_libraries(api-demo PRIVATE xgboost::xgboost) |
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Build script for the external-memory demo: declares the `external-memory-demo`
# C project and requires CMake 3.13 or newer.
cmake_minimum_required(VERSION 3.13) | ||
project(external-memory-demo LANGUAGES C VERSION 0.0.1) | ||
|
||
# Configuration fails if the xgboost package cannot be found (REQUIRED).
find_package(xgboost REQUIRED) | ||
|
||
# Build the demo from external_memory.c and link it against the xgboost::xgboost target.
add_executable(external-memory-demo external_memory.c) | ||
target_link_libraries(external-memory-demo PRIVATE xgboost::xgboost) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
Defining a Custom Data Iterator to Load Data from External Memory | ||
================================================================= | ||
|
||
A simple demo of using a custom data iterator with XGBoost. The feature is still | ||
**experimental** and not ready for production use. If you are not familiar with the C API, | ||
please read its introduction in our tutorials and visit the basic demo first. | ||
|
||
Defining Data Iterator | ||
---------------------- | ||
|
||
In the example, we define a custom data iterator with 2 methods: `reset` and `next`. The | ||
`next` method passes data into XGBoost and tells XGBoost whether the iterator has reached | ||
its end, and the `reset` method restarts the iteration. One important detail when using the C | ||
API for a data iterator is that users must keep the data passed in by the `next` method | ||
in memory until the next iteration or until `reset` is called. The external memory | ||
DMatrix is not limited to training; it is also valid for other features like prediction. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
/*! | ||
* Copyright 2021 XGBoost contributors | ||
* | ||
* \brief A simple example of using xgboost data callback API. | ||
*/ | ||
|
||
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xgboost/c_api.h>
|
||
#define safe_xgboost(err) \ | ||
if ((err) != 0) { \ | ||
fprintf(stderr, "%s:%d: error in %s: %s\n", __FILE__, __LINE__, #err, \ | ||
XGBGetLastError()); \ | ||
exit(1); \ | ||
} | ||
|
||
/* Number of batches handed to DataIterator_Init() by main(). */
#define N_BATCHS 32 | ||
/* Number of elements per batch handed to DataIterator_Init() by main(). */
#define BATCH_LEN 512 | ||
|
||
/* Shorthands for the handle types used throughout this demo. */ | ||
typedef DMatrixHandle DMatrix; | ||
typedef BoosterHandle Booster; | ||
|
||
typedef struct _DataIter { | ||
/* Data of each batch. */ | ||
float **data; | ||
/* Labels of each batch */ | ||
float **labels; | ||
/* Length of each batch. */ | ||
size_t *lengths; | ||
/* Total number of batches. */ | ||
size_t n; | ||
/* Current iteration. */ | ||
size_t cur_it; | ||
|
||
/* Private fields */ | ||
DMatrix _proxy; | ||
char _array[128]; | ||
} DataIter; | ||
|
||
/*
 * Abort the program with a diagnostic on stderr when an allocation result is
 * NULL.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement and stays correct inside an unbraced if/else.
 */
#define safe_malloc(ptr)                                                \
  do {                                                                  \
    if ((ptr) == NULL) {                                                \
      fprintf(stderr, "%s:%d: Failed to allocate memory.\n", __FILE__,  \
              __LINE__);                                                \
      exit(1);                                                          \
    }                                                                   \
  } while (0)
|
||
/** | ||
* Initialize with random data for demo. In practice the data should be loaded | ||
* from external memory. We just demonstrate how to use the iterator in | ||
* XGBoost. | ||
* | ||
* \param batch_size Number of elements for each batch. The demo here is only using 1 | ||
* column. | ||
* \param n_batches Number of batches. | ||
*/ | ||
void DataIterator_Init(DataIter *self, size_t batch_size, size_t n_batches) { | ||
self->n = n_batches; | ||
|
||
self->lengths = (size_t *)malloc(self->n * sizeof(size_t)); | ||
safe_malloc(self->lengths); | ||
for (size_t i = 0; i < self->n; ++i) { | ||
self->lengths[i] = batch_size; | ||
} | ||
|
||
self->data = (float **)malloc(self->n * sizeof(float *)); | ||
safe_malloc(self->data); | ||
self->labels = (float **)malloc(self->n * sizeof(float *)); | ||
safe_malloc(self->labels); | ||
|
||
/* Generate some random data. */ | ||
for (size_t i = 0; i < self->n; ++i) { | ||
self->data[i] = (float *)malloc(self->lengths[i] * sizeof(float)); | ||
safe_malloc(self->data[i]); | ||
for (size_t j = 0; j < self->lengths[i]; ++j) { | ||
float x = (float)rand() / (float)(RAND_MAX); | ||
self->data[i][j] = x; | ||
} | ||
|
||
self->labels[i] = (float *)malloc(self->lengths[i] * sizeof(float)); | ||
safe_malloc(self->labels[i]); | ||
for (size_t j = 0; j < self->lengths[i]; ++j) { | ||
float y = (float)rand() / (float)(RAND_MAX); | ||
self->labels[i][j] = y; | ||
} | ||
} | ||
|
||
self->cur_it = 0; | ||
safe_xgboost(XGProxyDMatrixCreate(&self->_proxy)); | ||
} | ||
|
||
/**
 * Release everything owned by the iterator: every per-batch data and label
 * buffer, the three bookkeeping arrays and the proxy DMatrix.
 *
 * \param self Iterator previously set up by DataIterator_Init().
 */
void DataIterator_Free(DataIter *self) {
  for (size_t i = 0; i < self->n; ++i) {
    free(self->data[i]);
    free(self->labels[i]);
  }
  free(self->data);
  /* The array of label pointers is a separate allocation and must be freed too. */
  free(self->labels);
  free(self->lengths);
  safe_xgboost(XGDMatrixFree(self->_proxy));
}
|
||
/**
 * Iterator callback that hands the current batch to the proxy DMatrix.
 *
 * \param handle Opaque handle; it is cast back to the DataIter it points to.
 *
 * \return 0 when all batches have been consumed (the iterator is rewound to
 *         the first batch), 1 after a batch has been passed on.
 */
int DataIterator_Next(DataIterHandle handle) {
  DataIter *self = (DataIter *)(handle);
  if (self->cur_it == self->n) {
    self->cur_it = 0;
    return 0; /* At end */
  }

  /* A JSON string encoding array interface (standard from numpy). The buffer
   * address and the row count are printed with %zu because both arguments
   * are size_t. */
  static const char array_format[] =
      "{\"data\": [%zu, false], \"shape\":[%zu, 1], \"typestr\": "
      "\"<f4\", \"version\": 3}";
  /* snprintf never writes past the buffer and always NUL-terminates it. */
  int written = snprintf(self->_array, sizeof(self->_array), array_format,
                         (size_t)self->data[self->cur_it],
                         self->lengths[self->cur_it]);
  if (written < 0 || (size_t)written >= sizeof(self->_array)) {
    fprintf(stderr, "%s:%d: Failed to encode the array interface.\n", __FILE__,
            __LINE__);
    exit(1);
  }

  safe_xgboost(XGProxyDMatrixSetDataDense(self->_proxy, self->_array));
  /* The data passed in the iterator must remain valid (not being freed until the next
   * iteration or reset) */
  /* NOTE(review): the trailing 1 presumably selects the float32 element type —
   * confirm against xgboost/c_api.h. */
  safe_xgboost(XGDMatrixSetDenseInfo(self->_proxy, "label",
                                     self->labels[self->cur_it],
                                     self->lengths[self->cur_it], 1));
  self->cur_it++;
  return 1; /* Continue. */
}
|
||
/**
 * Iterator callback that rewinds the iterator to its first batch.
 *
 * \param handle Opaque handle; it is cast back to the DataIter it points to.
 */
void DataIterator_Reset(DataIterHandle handle) {
  ((DataIter *)handle)->cur_it = 0;
}
|
||
/** | ||
* Train a regression model and save it into JSON model file. | ||
*/ | ||
/**
 * Fit a regression booster on `Xy` for 10 rounds, print the evaluation result
 * of every round, and write the model to "model.json".
 *
 * \param Xy Training matrix; it is also used as the only evaluation set.
 */
void TrainModel(DMatrix Xy) {
  const size_t kRounds = 10;
  DMatrix matrices[] = {Xy};
  const char *eval_names[] = {"train"};
  const char *eval_message = NULL;
  Booster bst;

  /* Create and configure the booster. */
  safe_xgboost(XGBoosterCreate(matrices, 1, &bst));
  /* Use approx for external memory training. */
  safe_xgboost(XGBoosterSetParam(bst, "tree_method", "approx"));
  safe_xgboost(XGBoosterSetParam(bst, "objective", "reg:squarederror"));

  /* One boosting update plus one evaluation per round. */
  for (size_t round = 0; round < kRounds; ++round) {
    safe_xgboost(XGBoosterUpdateOneIter(bst, (int)round, Xy));
    safe_xgboost(XGBoosterEvalOneIter(bst, (int)round, matrices, eval_names, 1,
                                      &eval_message));
    printf("%s\n", eval_message);
  }

  /* Save the model to a JSON file. */
  safe_xgboost(XGBoosterSaveModel(bst, "model.json"));

  safe_xgboost(XGBoosterFree(bst));
}
|
||
int main() { | ||
DataIter iter; | ||
DataIterator_Init(&iter, BATCH_LEN, N_BATCHS); | ||
|
||
/* Create DMatrix from iterator. During training, some cache files with the | ||
* prefix "cache-" will be generated in current directory */ | ||
char config[] = "{\"missing\": NaN, \"cache_prefix\": \"cache\"}"; | ||
DMatrix Xy; | ||
safe_xgboost(XGDMatrixCreateFromCallback( | ||
&iter, iter._proxy, DataIterator_Reset, DataIterator_Next, config, &Xy)); | ||
|
||
TrainModel(Xy); | ||
|
||
safe_xgboost(XGDMatrixFree(Xy)); | ||
|
||
DataIterator_Free(&iter); | ||
return 0; | ||
} |
Oops, something went wrong.