From 9e26cc9557aedd93bfa589d4dffdc3277d257db6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 16 Dec 2024 14:14:39 +0800 Subject: [PATCH 1/6] Small cleanups for DMatrix constructor. - Cleanup the C API documentation with consistent naming. - Avoid virtual function call `Info()` in the ctor. --- include/xgboost/c_api.h | 227 ++++++++++++++-------------- src/data/extmem_quantile_dmatrix.cc | 12 +- src/data/extmem_quantile_dmatrix.cu | 12 +- src/data/iterative_dmatrix.cc | 6 +- src/data/iterative_dmatrix.cu | 12 +- 5 files changed, 137 insertions(+), 132 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 111e9cacc95b..0740c9c23ba4 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -138,7 +138,6 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle /*! * \brief load a data matrix * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: - * - uri: The URI of the input file. The URI parameter `format` is required when loading text data. * \verbatim embed:rst:leading-asterisk * See :doc:`/tutorials/input_format` for more info. @@ -162,9 +161,12 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic /** * @brief Create a DMatrix from columnar data. (table) * - * @param data See @ref XGBoosterPredictFromColumnar for details. + * A special type of input to the `DMatrix` is the columnar format, which refers to + * column-based dataframes based on the Arrow format. + * + * @param data A list of JSON-encoded array interfaces. * @param config See @ref XGDMatrixCreateFromDense for details. - * @param out The created dmatrix. + * @param out The created DMatrix. * * @return 0 when success, -1 when failure happens */ @@ -173,46 +175,48 @@ XGB_DLL int XGDMatrixCreateFromColumnar(char const *data, char const *config, DM /** * @example c-api-demo.c */ -/*! - * \brief Create a matrix from CSR matrix. 
- * \param indptr JSON encoded __array_interface__ to row pointers in CSR. - * \param indices JSON encoded __array_interface__ to column indices in CSR. - * \param data JSON encoded __array_interface__ to values in CSR. - * \param ncol Number of columns. - * \param config JSON encoded configuration. Required values are: - * - missing: Which value to represent missing value. - * - nthread (optional): Number of threads used for initializing DMatrix. - * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. - * \param out created dmatrix - * \return 0 when success, -1 when failure happens +/** + * @brief Create a DMatrix from CSR matrix. + * @param indptr JSON encoded __array_interface__ to row pointers in CSR. + * @param indices JSON encoded __array_interface__ to column indices in CSR. + * @param data JSON encoded __array_interface__ to values in CSR. + * @param ncol The number of columns. + * @param config See @ref XGDMatrixCreateFromDense for details. + * @param out The created dmatrix + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char const *data, bst_ulong ncol, char const *config, DMatrixHandle *out); -/*! - * \brief Create a matrix from dense array. - * \param data JSON encoded __array_interface__ to array values. - * \param config JSON encoded configuration. Required values are: +/** + * @brief Create a DMatrix from dense array. + * + * The array interface is defined in https://numpy.org/doc/2.1/reference/arrays.interface.html + * We encode the interface as a JSON object. + * + * @param data JSON encoded __array_interface__ to array values. + * @param config JSON encoded configuration. Required values are: * - missing: Which value to represent missing value. * - nthread (optional): Number of threads used for initializing DMatrix. * - data_split_mode (optional): Whether the data was split by row or column beforehand. 
Default to row. - * \param out created dmatrix - * \return 0 when success, -1 when failure happens + * @param out The created DMatrix + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatrixHandle *out); -/*! - * \brief Create a matrix from a CSC matrix. - * \param indptr JSON encoded __array_interface__ to column pointers in CSC. - * \param indices JSON encoded __array_interface__ to row indices in CSC. - * \param data JSON encoded __array_interface__ to values in CSC. - * \param nrow number of rows in the matrix. - * \param config JSON encoded configuration. Supported values are: - * - missing: Which value to represent missing value. - * - nthread (optional): Number of threads used for initializing DMatrix. - * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. - * \param out created dmatrix - * \return 0 when success, -1 when failure happens +/** + * @brief Create a DMatrix from a CSC matrix. + * + * @param indptr JSON encoded __array_interface__ to column pointers in CSC. + * @param indices JSON encoded __array_interface__ to row indices in CSC. + * @param data JSON encoded __array_interface__ to values in CSC. + * @param nrow The number of rows in the matrix. + * @param config See @ref XGDMatrixCreateFromDense for details. + * @param out The created dmatrix. + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data, bst_ulong nrow, char const *config, DMatrixHandle *out); @@ -255,15 +259,16 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const float *data, // NOLINT float missing, DMatrixHandle *out, int nthread); -/*! - * \brief Create DMatrix from CUDA columnar format. (cuDF) - * \param data Array of JSON encoded __cuda_array_interface__ for each column. - * \param config JSON encoded configuration. 
Required values are: - * - missing: Which value to represent missing value. - * - nthread (optional): Number of threads used for initializing DMatrix. - * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. - * \param out created dmatrix - * \return 0 when success, -1 when failure happens +/** + * @brief Create DMatrix from CUDA columnar format. (cuDF) + * + * See @ref XGDMatrixCreateFromColumnar for a brief description of the columnar format. + * + * @param data A list of JSON-encoded array interfaces. + * @param config See @ref XGDMatrixCreateFromDense for details. + * @param out Created dmatrix + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, char const *config, DMatrixHandle *out); @@ -323,6 +328,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const * * - @ref XGDMatrixCallbackNext * - @ref DataIterResetCallback * - @ref XGProxyDMatrixSetDataCudaArrayInterface + * - @ref XGProxyDMatrixSetDataColumnar * - @ref XGProxyDMatrixSetDataCudaColumnar * - @ref XGProxyDMatrixSetDataDense * - @ref XGProxyDMatrixSetDataCSR @@ -469,7 +475,7 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy * - Step 0: Define a data iterator with 2 methods `reset`, and `next`. * - Step 1: Create a DMatrix proxy by @ref XGProxyDMatrixCreate and hold the handle. * - Step 2: Pass the iterator handle, proxy handle and 2 methods into - * `XGQuantileDMatrixCreateFromCallback`. + * @ref XGQuantileDMatrixCreateFromCallback. * - Step 3: Call appropriate data setters in `next` functions. * * See test_iterative_dmatrix.cu or Python interface for examples. @@ -537,52 +543,47 @@ XGB_DLL int XGExtMemQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out); -/*! - * \brief Set data on a DMatrix proxy. +/** + * @brief Set data on a DMatrix proxy. 
* - * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate - * \param c_interface_str Null terminated JSON document string representation of CUDA - * array interface. + * @param handle A DMatrix proxy created by @ref XGProxyDMatrixCreate + * @param data Null terminated JSON document string representation of CUDA + * array interface. * - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens */ -XGB_DLL int -XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle, - const char *c_interface_str); +XGB_DLL int XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle, const char *data); /** * @brief Set columnar (table) data on a DMatrix proxy. * - * @param handle A DMatrix proxy created by @ref XGProxyDMatrixCreate - * @param c_interface_str See @ref XGBoosterPredictFromColumnar for details. + * @param handle A DMatrix proxy created by @ref XGProxyDMatrixCreate + * @param data See @ref XGDMatrixCreateFromColumnar for details. * * @return 0 when success, -1 when failure happens */ -XGB_DLL int XGProxyDMatrixSetDataColumnar(DMatrixHandle handle, char const *c_interface_str); +XGB_DLL int XGProxyDMatrixSetDataColumnar(DMatrixHandle handle, char const *data); -/*! - * \brief Set data on a DMatrix proxy. +/** + * @brief Set CUDA-based columnar (table) data on a DMatrix proxy. * - * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate - * \param c_interface_str Null terminated JSON document string representation of CUDA - * array interface, with an array of columns. + * @param handle A DMatrix proxy created by @ref XGProxyDMatrixCreate + * @param data See @ref XGDMatrixCreateFromColumnar for details. 
* - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens */ -XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, - const char *c_interface_str); +XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, const char *data); -/*! - * \brief Set data on a DMatrix proxy. +/** + * @brief Set data on a DMatrix proxy. * - * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate - * \param c_interface_str Null terminated JSON document string representation of array - * interface. + * @param handle A DMatrix proxy created by @ref XGProxyDMatrixCreate + * @param data Null terminated JSON document string representation of array + * interface. * - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens */ -XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle, - char const *c_interface_str); +XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle, char const *data); /*! * \brief Set data on a DMatrix proxy. @@ -636,26 +637,30 @@ XGB_DLL int XGDMatrixFree(DMatrixHandle handle); * @example c-api-demo.c inference.c external_memory.c */ -/*! - * \brief load a data matrix into binary file - * \param handle a instance of data matrix - * \param fname file name - * \param silent print statistics when saving - * \return 0 when success, -1 when failure happens +/** + * @brief Save the DMatrix object into a file. `QuantileDMatrix` and external memory + * DMatrix are not supported. + * + * @param handle an instance of data matrix + * @param fname file name + * @param silent print statistics when saving + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle, const char *fname, int silent); -/*! - * \brief Set content in array interface to a content in info. - * \param handle a instance of data matrix - * \param field field name. 
- * \param c_interface_str JSON string representation of array interface. - * \return 0 when success, -1 when failure happens +/** + * @brief Set content in array interface to a content in info. + * + * @param handle An instance of data matrix + * @param field Field name. + * @param data JSON encoded __array_interface__ to values in the dense matrix/vector. + * + * @return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle, - char const* field, - char const* c_interface_str); +XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle, char const *field, + char const *data); /*! * \brief set float vector to a content in info @@ -1106,7 +1111,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, * * \return 0 when success, -1 when failure happens * - * \see XGBoosterPredictFromDense XGBoosterPredictFromCSR XGBoosterPredictFromCudaArray XGBoosterPredictFromCudaColumnar + * @see XGBoosterPredictFromDense XGBoosterPredictFromCSR XGBoosterPredictFromCudaArray XGBoosterPredictFromCudaColumnar */ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat, char const *config, bst_ulong const **out_shape, @@ -1149,7 +1154,7 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values, * prediction with DMatrix with a performance warning. * * @param handle Booster handle. - * @param values An JSON array of __array_interface__ for each column. + * @param data See @ref XGDMatrixCreateFromColumnar for more info. * @param config See @ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: * - "missing": float @@ -1196,50 +1201,50 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch bst_ulong *out_dim, const float **out_result); /** - * \brief Inplace prediction from CUDA Dense matrix (cupy in Python). + * @brief Inplace prediction from CUDA Dense matrix (cupy in Python). 
* - * \note If the booster is configured to run on a CPU, XGBoost falls back to run + * @note If the booster is configured to run on a CPU, XGBoost falls back to run * prediction with DMatrix with a performance warning. * - * \param handle Booster handle - * \param values JSON encoded __cuda_array_interface__ to values. - * \param config See \ref XGBoosterPredictFromDMatrix for more info. + * @param handle Booster handle + * @param values JSON encoded __cuda_array_interface__ to values. + * @param config See @ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: * - "missing": float - * \param m An optional (NULL if not available) proxy DMatrix instance + * @param proxy An optional (NULL if not available) proxy DMatrix instance * storing meta info. - * \param out_shape See \ref XGBoosterPredictFromDMatrix for more info. - * \param out_dim See \ref XGBoosterPredictFromDMatrix for more info. - * \param out_result See \ref XGBoosterPredictFromDMatrix for more info. + * @param out_shape See @ref XGBoosterPredictFromDMatrix for more info. + * @param out_dim See @ref XGBoosterPredictFromDMatrix for more info. + * @param out_result See @ref XGBoosterPredictFromDMatrix for more info. * - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *values, - char const *config, DMatrixHandle m, + char const *config, DMatrixHandle proxy, bst_ulong const **out_shape, bst_ulong *out_dim, const float **out_result); /** - * \brief Inplace prediction from CUDA dense dataframe (cuDF in Python). + * @brief Inplace prediction from CUDA dense dataframe (cuDF in Python). * - * \note If the booster is configured to run on a CPU, XGBoost falls back to run + * @note If the booster is configured to run on a CPU, XGBoost falls back to run * prediction with DMatrix with a performance warning. 
* - * \param handle Booster handle - * \param values List of __cuda_array_interface__ for all columns encoded in JSON list. - * \param config See \ref XGBoosterPredictFromDMatrix for more info. + * @param handle Booster handle + * @param data See @ref XGDMatrixCreateFromColumnar for more info. + * @param config See @ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: * - "missing": float - * \param m An optional (NULL if not available) proxy DMatrix instance + * @param proxy An optional (NULL if not available) proxy DMatrix instance * storing meta info. - * \param out_shape See \ref XGBoosterPredictFromDMatrix for more info. - * \param out_dim See \ref XGBoosterPredictFromDMatrix for more info. - * \param out_result See \ref XGBoosterPredictFromDMatrix for more info. + * @param out_shape See @ref XGBoosterPredictFromDMatrix for more info. + * @param out_dim See @ref XGBoosterPredictFromDMatrix for more info. + * @param out_result See @ref XGBoosterPredictFromDMatrix for more info. * - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *values, - char const *config, DMatrixHandle m, +XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *data, + char const *config, DMatrixHandle proxy, bst_ulong const **out_shape, bst_ulong *out_dim, const float **out_result); diff --git a/src/data/extmem_quantile_dmatrix.cc b/src/data/extmem_quantile_dmatrix.cc index 028d8a884c86..97e94110b3b4 100644 --- a/src/data/extmem_quantile_dmatrix.cc +++ b/src/data/extmem_quantile_dmatrix.cc @@ -9,7 +9,7 @@ #include "../tree/param.h" // FIXME(jiamingy): Find a better way to share this parameter. 
#include "batch_utils.h" // for CheckParam, RegenGHist -#include "proxy_dmatrix.h" // for DataIterProxy, HostAdapterDispatch +#include "proxy_dmatrix.h" // for DataIterProxy #include "quantile_dmatrix.h" // for GetDataShape, MakeSketches #include "simple_batch_iterator.h" // for SimpleBatchIteratorImpl #include "sparse_page_source.h" // for MakeCachePrefix @@ -84,7 +84,7 @@ void ExtMemQuantileDMatrix::InitFromCPU( * Generate quantiles */ std::vector h_ft; - cpu_impl::MakeSketches(ctx, iter.get(), proxy, ref, missing, &cuts, p, this->Info(), ext_info, + cpu_impl::MakeSketches(ctx, iter.get(), proxy, ref, missing, &cuts, p, this->info_, ext_info, &h_ft); /** @@ -92,7 +92,7 @@ void ExtMemQuantileDMatrix::InitFromCPU( */ auto id = MakeCache(this, ".gradient_index.page", false, cache_prefix_, &cache_info_); this->ghist_index_source_ = std::make_unique( - ctx, missing, &this->Info(), cache_info_.at(id), p, cuts, iter, proxy, ext_info.base_rowids); + ctx, missing, &this->info_, cache_info_.at(id), p, cuts, iter, proxy, ext_info.base_rowids); /** * Force initialize the cache and do some sanity checks along the way @@ -102,15 +102,15 @@ void ExtMemQuantileDMatrix::InitFromCPU( for (auto const &page : this->GetGradientIndexImpl()) { n_total_samples += page.Size(); CHECK_EQ(page.base_rowid, ext_info.base_rowids[k]); - CHECK_EQ(page.Features(), this->Info().num_col_); + CHECK_EQ(page.Features(), this->info_.num_col_); ++k, ++batch_cnt; } CHECK_EQ(batch_cnt, ext_info.n_batches); CHECK_EQ(n_total_samples, ext_info.accumulated_rows); if (cuts.HasCategorical()) { - CHECK(!this->Info().feature_types.Empty()); + CHECK(!this->info_.feature_types.Empty()); } - CHECK_EQ(cuts.HasCategorical(), this->Info().HasCategorical()); + CHECK_EQ(cuts.HasCategorical(), this->info_.HasCategorical()); } [[nodiscard]] BatchSet ExtMemQuantileDMatrix::GetGradientIndexImpl() { diff --git a/src/data/extmem_quantile_dmatrix.cu b/src/data/extmem_quantile_dmatrix.cu index c71c4fb0c2ee..1a3ed14c9463 100644 
--- a/src/data/extmem_quantile_dmatrix.cu +++ b/src/data/extmem_quantile_dmatrix.cu @@ -51,7 +51,7 @@ void ExtMemQuantileDMatrix::InitFromCUDA( */ auto cuts = std::make_shared(); ExternalDataInfo ext_info; - cuda_impl::MakeSketches(ctx, iter.get(), proxy, ref, p, config.missing, cuts, this->Info(), + cuda_impl::MakeSketches(ctx, iter.get(), proxy, ref, p, config.missing, cuts, this->info_, max_quantile_blocks, &ext_info); ext_info.SetInfo(ctx, &this->info_); @@ -62,7 +62,7 @@ void ExtMemQuantileDMatrix::InitFromCUDA( // overhead with inference. But the training procedures can confortably overlap with the // data transfer. auto cinfo = EllpackCacheInfo{p, (ref != nullptr), config.max_num_device_pages, config.missing}; - CalcCacheMapping(ctx, this->Info().IsDense(), cuts, + CalcCacheMapping(ctx, this->info_.IsDense(), cuts, DftMinCachePageBytes(config.min_cache_page_bytes), ext_info, &cinfo); CHECK_EQ(cinfo.cache_mapping.size(), ext_info.n_batches); auto n_batches = cinfo.buffer_rows.size(); // The number of batches after page concatenation. 
@@ -79,8 +79,8 @@ void ExtMemQuantileDMatrix::InitFromCUDA( std::visit( [&](auto &&ptr) { using SourceT = typename std::remove_reference_t::element_type; - ptr = std::make_shared(ctx, &this->Info(), ext_info, cache_info_.at(id), cuts, - iter, proxy, cinfo); + ptr = std::make_shared(ctx, &this->info_, ext_info, cache_info_.at(id), cuts, iter, + proxy, cinfo); }, ellpack_page_source_); @@ -105,9 +105,9 @@ void ExtMemQuantileDMatrix::InitFromCUDA( } this->n_batches_ = this->cache_info_.at(id)->Size(); if (cuts->HasCategorical()) { - CHECK(!this->Info().feature_types.Empty()); + CHECK(!this->info_.feature_types.Empty()); } - CHECK_EQ(cuts->HasCategorical(), this->Info().HasCategorical()); + CHECK_EQ(cuts->HasCategorical(), this->info_.HasCategorical()); } [[nodiscard]] BatchSet ExtMemQuantileDMatrix::GetEllpackPageImpl() { diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 42bd43655fff..43525a156598 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -72,12 +72,12 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, * Generate quantiles */ std::vector h_ft; - cpu_impl::MakeSketches(ctx, &iter, proxy, ref, missing, &cuts, p, this->Info(), ext_info, &h_ft); + cpu_impl::MakeSketches(ctx, &iter, proxy, ref, missing, &cuts, p, this->info_, ext_info, &h_ft); /** * Generate gradient index. 
*/ - this->ghist_ = std::make_unique(this->Info(), std::move(cuts), p.max_bin); + this->ghist_ = std::make_unique(this->info_, std::move(cuts), p.max_bin); std::size_t rbegin = 0; std::size_t prev_sum = 0; std::size_t i = 0; @@ -119,7 +119,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, CHECK_EQ(proxy->Info().labels.Size(), 0); } - Info().feature_types.HostVector() = h_ft; + info_.feature_types.HostVector() = h_ft; } BatchSet IterativeDMatrix::GetGradientIndex(Context const* ctx, diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 79f62c1c9805..72292207b265 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -42,7 +42,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, */ auto cuts = std::make_shared(); ExternalDataInfo ext_info; - cuda_impl::MakeSketches(ctx, &iter, proxy, ref, p, missing, cuts, this->Info(), + cuda_impl::MakeSketches(ctx, &iter, proxy, ref, p, missing, cuts, this->info_, max_quantile_blocks, &ext_info); ext_info.SetInfo(ctx, &this->info_); @@ -105,11 +105,11 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, IterativeDMatrix::IterativeDMatrix(std::shared_ptr ellpack, MetaInfo const& info, BatchParam batch) { this->ellpack_ = ellpack; - CHECK_EQ(this->Info().num_row_, 0); - CHECK_EQ(this->Info().num_col_, 0); - this->Info().Extend(info, true, true); - this->Info().num_nonzero_ = info.num_nonzero_; - CHECK_EQ(this->Info().num_row_, info.num_row_); + CHECK_EQ(this->info_.num_row_, 0); + CHECK_EQ(this->info_.num_col_, 0); + this->info_.Extend(info, true, true); + this->info_.num_nonzero_ = info.num_nonzero_; + CHECK_EQ(this->info_.num_row_, info.num_row_); this->batch_ = batch; } From 69aaee10611666ca38ece30fb5bb9c818bdc15d5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 16 Dec 2024 14:41:54 +0800 Subject: [PATCH 2/6] Note for proxy. 
--- include/xgboost/c_api.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 0740c9c23ba4..ce9fd60c0e6c 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -410,15 +410,21 @@ XGB_DLL int XGDMatrixCreateFromDataIter( /** * Second set of callback functions, used by constructing Quantile DMatrix or external - * memory DMatrix using custom iterator. + * memory DMatrix using a custom iterator. */ -/*! - * \brief Create a DMatrix proxy for setting data, can be free by XGDMatrixFree. +/** + * @brief Create a DMatrix proxy for setting data, can be freed by @ref XGDMatrixFree. * - * \param out The created Device Quantile DMatrix + * The DMatrix proxy is only a temporary reference (wrapper) to the actual user data. For + * instance, if a dense matrix (like a numpy/cupy array) is passed into the proxy DMatrix + * via the @ref XGProxyDMatrixSetDataCudaArrayInterface method, then the proxy DMatrix + * holds only a reference and the input array cannot be freed until the next iteration + * starts, signaled by a call to the @ref XGDMatrixCallbackNext by XGBoost. * - * \return 0 when success, -1 when failure happens + * @param out The created Device Quantile DMatrix + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out); From 85d702799035ac03e929d6625b2c61b673559807 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 16 Dec 2024 14:42:27 +0800 Subject: [PATCH 3/6] typo. 
--- include/xgboost/c_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index ce9fd60c0e6c..94145626e8ec 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -422,9 +422,9 @@ XGB_DLL int XGDMatrixCreateFromDataIter( * holds only a reference and the input array cannot be freed until the next iteration * starts, signaled by a call to the @ref XGDMatrixCallbackNext by XGBoost. * - * @param out The created Device Quantile DMatrix + * @param out The created Proxy DMatrix. * - * @return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens. */ XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out); From 3ebe50458646b7c1c5b58004d9ac42cdeef1f89b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 16 Dec 2024 14:49:56 +0800 Subject: [PATCH 4/6] note. --- include/xgboost/c_api.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 94145626e8ec..078b5bdf87a8 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -417,10 +417,13 @@ XGB_DLL int XGDMatrixCreateFromDataIter( * @brief Create a DMatrix proxy for setting data, can be freed by @ref XGDMatrixFree. * * The DMatrix proxy is only a temporary reference (wrapper) to the actual user data. For - * instance, if a dense matrix (like a numpy/cupy array) is passed into the proxy DMatrix - * via the @ref XGProxyDMatrixSetDataCudaArrayInterface method, then the proxy DMatrix - * holds only a reference and the input array cannot be freed until the next iteration - * starts, signaled by a call to the @ref XGDMatrixCallbackNext by XGBoost. 
+ * instance, if a dense matrix (like a numpy array) is passed into the proxy DMatrix via + * the @ref XGProxyDMatrixSetDataCudaArrayInterface method, then the proxy DMatrix holds + * only a reference and the input array cannot be freed until the next iteration starts, + * signaled by a call to the @ref XGDMatrixCallbackNext by XGBoost. It's called + * `ProxyDMatrix` because it reuses the interface of the DMatrix class in XGBoost, but + * it's just a middle interface for the @ref XGDMatrixCreateFromCallback and related + * constructors to consume various user input types. * * @param out The created Proxy DMatrix. * From 436c7c36fbfa258f772812df5ecb79715c396e64 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 16 Dec 2024 14:56:43 +0800 Subject: [PATCH 5/6] notes. --- doc/contrib/ci.rst | 2 ++ include/xgboost/c_api.h | 18 +++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index c9c79231a2ec..edc144326cdc 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -513,9 +513,11 @@ which are translated to the following ``docker run`` invocations: .. _vm_images: + ------------------ Notes on VM images ------------------ + In the ``vm_images/`` directory of `dmlc/xgboost-devops `_, we define Packer scripts to build images for Virtual Machines (VM) on `Amazon EC2 `_. diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 078b5bdf87a8..4879dd62ebfd 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -422,26 +422,30 @@ XGB_DLL int XGDMatrixCreateFromDataIter( * only a reference and the input array cannot be freed until the next iteration starts, * signaled by a call to the @ref XGDMatrixCallbackNext by XGBoost. 
It's called * `ProxyDMatrix` because it reuses the interface of the DMatrix class in XGBoost, but - * it's just a middle interface for the @ref XGDMatrixCreateFromCallback and related + * it's just a mid interface for the @ref XGDMatrixCreateFromCallback and related * constructors to consume various user input types. * + * @code{.unparsed} + * User inputs -> Proxy DMatrix (wrapper) -> Actual DMatrix + * @endcode + * * @param out The created Proxy DMatrix. * * @return 0 when success, -1 when failure happens. */ XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out); -/*! - * \brief Callback function prototype for getting next batch of data. +/** + * @brief Callback function prototype for getting next batch of data. * - * \param iter A handler to the user defined iterator. + * @param iter A handler to the user defined iterator. * - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens. */ XGB_EXTERN_C typedef int XGDMatrixCallbackNext(DataIterHandle iter); // NOLINT(*) -/*! - * \brief Callback function prototype for resetting external iterator +/** + * @brief Callback function prototype for resetting the external iterator. */ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLINT(*) From e5cd15ccdb7c4d2289ad93f94a2f5d74431d3018 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 16 Dec 2024 14:57:18 +0800 Subject: [PATCH 6/6] cpu in. --- include/xgboost/c_api.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 4879dd62ebfd..852f65d38f52 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -418,12 +418,12 @@ XGB_DLL int XGDMatrixCreateFromDataIter( * * The DMatrix proxy is only a temporary reference (wrapper) to the actual user data. 
For * instance, if a dense matrix (like a numpy array) is passed into the proxy DMatrix via - * the @ref XGProxyDMatrixSetDataCudaArrayInterface method, then the proxy DMatrix holds - * only a reference and the input array cannot be freed until the next iteration starts, - * signaled by a call to the @ref XGDMatrixCallbackNext by XGBoost. It's called - * `ProxyDMatrix` because it reuses the interface of the DMatrix class in XGBoost, but - * it's just a mid interface for the @ref XGDMatrixCreateFromCallback and related - * constructors to consume various user input types. + * the @ref XGProxyDMatrixSetDataDense method, then the proxy DMatrix holds only a + * reference and the input array cannot be freed until the next iteration starts, signaled + * by a call to the @ref XGDMatrixCallbackNext by XGBoost. It's called `ProxyDMatrix` + * because it reuses the interface of the DMatrix class in XGBoost, but it's just an + * intermediate interface for the @ref XGDMatrixCreateFromCallback and related constructors + * to consume various user input types. * * @code{.unparsed} * User inputs -> Proxy DMatrix (wrapper) -> Actual DMatrix