From 9c16f40989751344189aef94682680bb62563759 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 30 Apr 2020 12:29:31 +0800 Subject: [PATCH 1/3] Move device dmatrix construction code into ellpack. * Make ellpack more self contained. * Delete duplicated `GetRowStride`. --- include/xgboost/data.h | 2 + src/data/device_adapter.cuh | 25 ++++ src/data/device_dmatrix.cu | 205 ++-------------------------- src/data/ellpack_page.cu | 227 ++++++++++++++++++++++++++++---- src/data/ellpack_page.cuh | 17 +++ src/data/ellpack_page_source.cu | 14 -- src/data/sparse_page_source.h | 15 ++- 7 files changed, 266 insertions(+), 239 deletions(-) diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 7a09439c57ac..a55a800f9357 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -350,6 +350,8 @@ class EllpackPage { /*! \brief Destructor. */ ~EllpackPage(); + EllpackPage(EllpackPage&& that); + /*! \return Number of instances in the page. */ size_t Size() const; diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index ca25cba256b8..64b45f01581c 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -212,6 +212,31 @@ class CupyAdapter : public detail::SingleBatchDataIter { int device_idx_; }; +// Returns maximum row length +template +size_t GetRowCounts(const AdapterBatchT& batch, common::Span offset, + int device_idx, float missing) { + IsValidFunctor is_valid(missing); + // Count elements per row + dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) { + auto element = batch.GetElement(idx); + if (is_valid(element)) { + atomicAdd(reinterpret_cast( // NOLINT + &offset[element.row_idx]), + static_cast(1)); // NOLINT + } + }); + dh::XGBCachingDeviceAllocator alloc; + size_t row_stride = thrust::reduce( + thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), + thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0), + thrust::maximum()); + return row_stride; +} + +#define DEFINE_DEVICE_ADAPTER(__func) \ + __func(::xgboost::data::CudfAdapter); \ + __func(::xgboost::data::CupyAdapter); }; // namespace data } // namespace xgboost #endif // XGBOOST_DATA_DEVICE_ADAPTER_H_ diff --git a/src/data/device_dmatrix.cu b/src/data/device_dmatrix.cu index 5dbc76ded6d5..e7512b107a42 100644 --- a/src/data/device_dmatrix.cu +++ b/src/data/device_dmatrix.cu @@ -19,181 +19,6 @@ namespace xgboost { namespace data { - -// Returns maximum row length -template -size_t GetRowCounts(const AdapterBatchT& batch, common::Span offset, - int device_idx, float missing) { - IsValidFunctor is_valid(missing); - // Count elements per row - dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) { - auto element = batch.GetElement(idx); - if (is_valid(element)) { - atomicAdd(reinterpret_cast( // NOLINT - &offset[element.row_idx]), - static_cast(1)); // NOLINT - } - }); - dh::XGBCachingDeviceAllocator alloc; - size_t row_stride = thrust::reduce( - thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), - thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0), - thrust::maximum()); - return row_stride; -} - -template -struct WriteCompressedEllpackFunctor { - WriteCompressedEllpackFunctor(common::CompressedByteT* buffer, - const common::CompressedBufferWriter& writer, - AdapterBatchT batch, - EllpackDeviceAccessor accessor, - const IsValidFunctor& is_valid) - : d_buffer(buffer), - writer(writer), - batch(std::move(batch)), - accessor(std::move(accessor)), - is_valid(is_valid) {} - - common::CompressedByteT* d_buffer; - common::CompressedBufferWriter writer; - AdapterBatchT batch; - EllpackDeviceAccessor accessor; - IsValidFunctor is_valid; - - using Tuple = thrust::tuple; - __device__ size_t operator()(Tuple out) { - auto e = batch.GetElement(out.get<2>()); - if (is_valid(e)) { - // -1 because the scan is inclusive - size_t output_position = - accessor.row_stride * e.row_idx + out.get<1>() - 1; - auto bin_idx = accessor.SearchBin(e.value, e.column_idx); - writer.AtomicWriteSymbol(d_buffer, bin_idx, output_position); - } - return 0; - } -}; - -// Here the data is already correctly ordered and simply needs to be compacted -// to remove missing data -template -void CopyDataRowMajor(const AdapterBatchT& batch, EllpackPageImpl* dst, - int device_idx, float missing) { - // Some witchcraft happens here - // The goal is to copy valid elements out of the input to an ellpack matrix - // with a given row stride, using no extra working memory Standard stream - // compaction needs to be modified to do this, so we manually define a - // segmented stream compaction via operators on an inclusive scan. The output - // of this inclusive scan is fed to a custom function which works out the - // correct output position - auto counting = thrust::make_counting_iterator(0llu); - IsValidFunctor is_valid(missing); - auto key_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) { return batch.GetElement(idx).row_idx; }); - auto value_iter = dh::MakeTransformIterator( - counting, [=] __device__(size_t idx) -> size_t { - return is_valid(batch.GetElement(idx)); - }); - - auto key_value_index_iter = thrust::make_zip_iterator( - thrust::make_tuple(key_iter, value_iter, counting)); - - // Tuple[0] = The row index of the input, used as a key to define segments - // Tuple[1] = Scanned flags of valid elements for each row - // Tuple[2] = The index in the input data - using Tuple = thrust::tuple; - - auto device_accessor = dst->GetDeviceAccessor(device_idx); - common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); - - // We redirect the scan output into this functor to do the actual writing - WriteCompressedEllpackFunctor functor( - d_compressed_buffer, writer, batch, device_accessor, is_valid); - thrust::discard_iterator discard; - thrust::transform_output_iterator< - WriteCompressedEllpackFunctor, decltype(discard)> - out(discard, functor); - dh::XGBCachingDeviceAllocator alloc; - thrust::inclusive_scan(thrust::cuda::par(alloc), key_value_index_iter, - key_value_index_iter + batch.Size(), out, - [=] __device__(Tuple a, Tuple b) { - // Key equal - if (a.get<0>() == b.get<0>()) { - b.get<1>() += a.get<1>(); - return b; - } - // Not equal - return b; - }); -} - -template -void CopyDataColumnMajor(AdapterT* adapter, const AdapterBatchT& batch, - EllpackPageImpl* dst, float missing) { - // Step 1: Get the sizes of the input columns - dh::caching_device_vector column_sizes(adapter->NumColumns(), 0); - auto d_column_sizes = column_sizes.data().get(); - // Populate column sizes - dh::LaunchN(adapter->DeviceIdx(), batch.Size(), [=] __device__(size_t idx) { - const auto& e = batch.GetElement(idx); - atomicAdd(reinterpret_cast( // NOLINT - &d_column_sizes[e.column_idx]), - static_cast(1)); // NOLINT - }); - - thrust::host_vector host_column_sizes = column_sizes; - - // Step 2: Iterate over columns, place elements in correct row, increment - // temporary row pointers - dh::caching_device_vector temp_row_ptr(adapter->NumRows(), 0); - auto d_temp_row_ptr = temp_row_ptr.data().get(); - auto row_stride = dst->row_stride; - size_t begin = 0; - auto device_accessor = dst->GetDeviceAccessor(adapter->DeviceIdx()); - common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); - IsValidFunctor is_valid(missing); - for (auto size : host_column_sizes) { - size_t end = begin + size; - dh::LaunchN(adapter->DeviceIdx(), end - begin, [=] __device__(size_t idx) { - auto writer_non_const = - writer; // For some reason this variable gets captured as const - const auto& e = batch.GetElement(idx + begin); - if (!is_valid(e)) return; - size_t output_position = - e.row_idx * row_stride + d_temp_row_ptr[e.row_idx]; - auto bin_idx = device_accessor.SearchBin(e.value, e.column_idx); - writer_non_const.AtomicWriteSymbol(d_compressed_buffer, bin_idx, - output_position); - d_temp_row_ptr[e.row_idx] += 1; - }); - - begin = end; - } -} - -void WriteNullValues(EllpackPageImpl* dst, int device_idx, - common::Span row_counts) { - // Write the null values - auto device_accessor = dst->GetDeviceAccessor(device_idx); - common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); - auto row_stride = dst->row_stride; - dh::LaunchN(device_idx, row_stride * dst->n_rows, [=] __device__(size_t idx) { - auto writer_non_const = - writer; // For some reason this variable gets captured as const - size_t row_idx = idx / row_stride; - size_t row_offset = idx % row_stride; - if (row_offset >= row_counts[row_idx]) { - writer_non_const.AtomicWriteSymbol(d_compressed_buffer, - device_accessor.NullValue(), idx); - } - }); -} - // Does not currently support metainfo as no on-device data source contains this // Current implementation assumes a single batch. More batches can // be supported in future. Does not currently support inferring row/column size @@ -210,30 +35,24 @@ DeviceDMatrix::DeviceDMatrix(AdapterT* adapter, float missing, int nthread, int size_t row_stride = GetRowCounts(batch, row_counts_span, adapter->DeviceIdx(), missing); + ellpack_page_.reset(new EllpackPage()); + *ellpack_page_->Impl() = + EllpackPageImpl(adapter, missing, this->IsDense(), nthread, max_bin, + row_counts_span, row_stride); + dh::XGBCachingDeviceAllocator alloc; info_.num_nonzero_ = thrust::reduce(thrust::cuda::par(alloc), - row_counts.begin(), row_counts.end()); + row_counts.begin(), row_counts.end()); info_.num_col_ = adapter->NumColumns(); info_.num_row_ = adapter->NumRows(); - ellpack_page_.reset(new EllpackPage()); - *ellpack_page_->Impl() = - EllpackPageImpl(adapter->DeviceIdx(), cuts, this->IsDense(), row_stride, - adapter->NumRows()); - if (adapter->IsRowMajor()) { - CopyDataRowMajor(batch, ellpack_page_->Impl(), adapter->DeviceIdx(), - missing); - } else { - CopyDataColumnMajor(adapter, batch, ellpack_page_->Impl(), missing); - } - - WriteNullValues(ellpack_page_->Impl(), adapter->DeviceIdx(), row_counts_span); - // Synchronise worker columns rabit::Allreduce(&info_.num_col_, 1); } -template DeviceDMatrix::DeviceDMatrix(CudfAdapter* adapter, float missing, - int nthread, int max_bin); -template DeviceDMatrix::DeviceDMatrix(CupyAdapter* adapter, float missing, - int nthread, int max_bin); + +#define DEVICE_DMARIX_SPECIALIZATION(__ADAPTER_T) \ + template DeviceDMatrix::DeviceDMatrix(__ADAPTER_T* adapter, float missing, \ + int nthread, int max_bin); + +DEFINE_DEVICE_ADAPTER(DEVICE_DMARIX_SPECIALIZATION) } // namespace data } // namespace xgboost diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 91f25b7a7123..c92f7dd7f28c 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -3,10 +3,12 @@ */ #include - +#include +#include #include "../common/hist_util.h" #include "../common/random.h" #include "./ellpack_page.cuh" +#include "device_adapter.cuh" namespace xgboost { @@ -17,6 +19,8 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) EllpackPage::~EllpackPage() = default; +EllpackPage::EllpackPage(EllpackPage&& that) { std::swap(impl_, that.impl_); } + size_t EllpackPage::Size() const { return impl_->Size(); } void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); } @@ -74,22 +78,19 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, dh::safe_cuda(cudaSetDevice(device)); monitor_.StartCuda("InitCompressedData"); - InitCompressedData(device); + this->InitCompressedData(device); monitor_.StopCuda("InitCompressedData"); } -size_t GetRowStride(DMatrix* dmat) { - if (dmat->IsDense()) return dmat->Info().num_col_; - - size_t row_stride = 0; - for (const auto& batch : dmat->GetBatches()) { - const auto& row_offset = batch.offset.ConstHostVector(); - for (auto i = 1ull; i < row_offset.size(); i++) { - row_stride = std::max( - row_stride, static_cast(row_offset[i] - row_offset[i - 1])); - } - } - return row_stride; +EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, + const SparsePage& page, bool is_dense, + size_t row_stride) + : cuts_(std::move(cuts)), + is_dense(is_dense), + n_rows(page.Size()), + row_stride(row_stride) { + this->InitCompressedData(device); + this->CreateHistIndices(device, page); } // Construct an ELLPACK matrix in memory. @@ -117,6 +118,193 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) monitor_.StopCuda("BinningCompression"); } +template +struct WriteCompressedEllpackFunctor { + WriteCompressedEllpackFunctor(common::CompressedByteT* buffer, + const common::CompressedBufferWriter& writer, + AdapterBatchT batch, + EllpackDeviceAccessor accessor, + const data::IsValidFunctor& is_valid) + : d_buffer(buffer), + writer(writer), + batch(std::move(batch)), + accessor(std::move(accessor)), + is_valid(is_valid) {} + + common::CompressedByteT* d_buffer; + common::CompressedBufferWriter writer; + AdapterBatchT batch; + EllpackDeviceAccessor accessor; + data::IsValidFunctor is_valid; + + using Tuple = thrust::tuple; + __device__ size_t operator()(Tuple out) { + auto e = batch.GetElement(out.get<2>()); + if (is_valid(e)) { + // -1 because the scan is inclusive + size_t output_position = + accessor.row_stride * e.row_idx + out.get<1>() - 1; + auto bin_idx = accessor.SearchBin(e.value, e.column_idx); + writer.AtomicWriteSymbol(d_buffer, bin_idx, output_position); + } + return 0; + } +}; + +// Here the data is already correctly ordered and simply needs to be compacted +// to remove missing data +template +void CopyDataRowMajor(const AdapterBatchT& batch, EllpackPageImpl* dst, + int device_idx, float missing) { + // Some witchcraft happens here + // The goal is to copy valid elements out of the input to an ellpack matrix + // with a given row stride, using no extra working memory Standard stream + // compaction needs to be modified to do this, so we manually define a + // segmented stream compaction via operators on an inclusive scan. The output + // of this inclusive scan is fed to a custom function which works out the + // correct output position + auto counting = thrust::make_counting_iterator(0llu); + data::IsValidFunctor is_valid(missing); + auto key_iter = dh::MakeTransformIterator( + counting, + [=] __device__(size_t idx) { + return batch.GetElement(idx).row_idx; + }); + auto value_iter = dh::MakeTransformIterator( + counting, + [=] __device__(size_t idx) -> size_t { + return is_valid(batch.GetElement(idx)); + }); + + auto key_value_index_iter = thrust::make_zip_iterator( + thrust::make_tuple(key_iter, value_iter, counting)); + + // Tuple[0] = The row index of the input, used as a key to define segments + // Tuple[1] = Scanned flags of valid elements for each row + // Tuple[2] = The index in the input data + using Tuple = thrust::tuple; + + auto device_accessor = dst->GetDeviceAccessor(device_idx); + common::CompressedBufferWriter writer(device_accessor.NumSymbols()); + auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + + // We redirect the scan output into this functor to do the actual writing + WriteCompressedEllpackFunctor functor( + d_compressed_buffer, writer, batch, device_accessor, is_valid); + thrust::discard_iterator discard; + thrust::transform_output_iterator< + WriteCompressedEllpackFunctor, decltype(discard)> + out(discard, functor); + dh::XGBCachingDeviceAllocator alloc; + thrust::inclusive_scan(thrust::cuda::par(alloc), key_value_index_iter, + key_value_index_iter + batch.Size(), out, + [=] __device__(Tuple a, Tuple b) { + // Key equal + if (a.get<0>() == b.get<0>()) { + b.get<1>() += a.get<1>(); + return b; + } + // Not equal + return b; + }); +} + +template +void CopyDataColumnMajor(AdapterT* adapter, const AdapterBatchT& batch, + EllpackPageImpl* dst, float missing) { + // Step 1: Get the sizes of the input columns + dh::caching_device_vector column_sizes(adapter->NumColumns(), 0); + auto d_column_sizes = column_sizes.data().get(); + // Populate column sizes + dh::LaunchN(adapter->DeviceIdx(), batch.Size(), [=] __device__(size_t idx) { + const auto& e = batch.GetElement(idx); + atomicAdd(reinterpret_cast( // NOLINT + &d_column_sizes[e.column_idx]), + static_cast(1)); // NOLINT + }); + + thrust::host_vector host_column_sizes = column_sizes; + + // Step 2: Iterate over columns, place elements in correct row, increment + // temporary row pointers + dh::caching_device_vector temp_row_ptr(adapter->NumRows(), 0); + auto d_temp_row_ptr = temp_row_ptr.data().get(); + auto row_stride = dst->row_stride; + size_t begin = 0; + auto device_accessor = dst->GetDeviceAccessor(adapter->DeviceIdx()); + common::CompressedBufferWriter writer(device_accessor.NumSymbols()); + auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + data::IsValidFunctor is_valid(missing); + for (auto size : host_column_sizes) { + size_t end = begin + size; + dh::LaunchN(adapter->DeviceIdx(), end - begin, [=] __device__(size_t idx) { + auto writer_non_const = + writer; // For some reason this variable gets captured as const + const auto& e = batch.GetElement(idx + begin); + if (!is_valid(e)) return; + size_t output_position = + e.row_idx * row_stride + d_temp_row_ptr[e.row_idx]; + auto bin_idx = device_accessor.SearchBin(e.value, e.column_idx); + writer_non_const.AtomicWriteSymbol(d_compressed_buffer, bin_idx, + output_position); + d_temp_row_ptr[e.row_idx] += 1; + }); + + begin = end; + } +} + +void WriteNullValues(EllpackPageImpl* dst, int device_idx, + common::Span row_counts) { + // Write the null values + auto device_accessor = dst->GetDeviceAccessor(device_idx); + common::CompressedBufferWriter writer(device_accessor.NumSymbols()); + auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto row_stride = dst->row_stride; + dh::LaunchN(device_idx, row_stride * dst->n_rows, [=] __device__(size_t idx) { + auto writer_non_const = + writer; // For some reason this variable gets captured as const + size_t row_idx = idx / row_stride; + size_t row_offset = idx % row_stride; + assert(row_idx < row_counts.size()); + if (row_idx >= row_counts.size()) { + printf("row_counts: %lld, idx: %lld\n", row_counts.size(), row_idx); + } + if (row_offset >= row_counts[row_idx]) { + writer_non_const.AtomicWriteSymbol(d_compressed_buffer, + device_accessor.NullValue(), idx); + } + }); +} + +template +EllpackPageImpl::EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense, int nthread, + int max_bin, common::Span row_counts_span, + size_t row_stride) { + common::HistogramCuts cuts = + common::AdapterDeviceSketch(adapter, max_bin, missing); + dh::safe_cuda(cudaSetDevice(adapter->DeviceIdx())); + auto& batch = adapter->Value(); + + *this = EllpackPageImpl(adapter->DeviceIdx(), cuts, is_dense, row_stride, + adapter->NumRows()); + if (adapter->IsRowMajor()) { + CopyDataRowMajor(batch, this, adapter->DeviceIdx(), missing); + } else { + CopyDataColumnMajor(adapter, batch, this, missing); + } + + WriteNullValues(this, adapter->DeviceIdx(), row_counts_span); +} + +#define ELLPACK_SPECIALIZATION(__ADAPTER_T) \ + template EllpackPageImpl::EllpackPageImpl( \ + __ADAPTER_T* adapter, float missing, bool is_dense, int nthread, int max_bin, \ + common::Span row_counts_span, \ + size_t row_stride); + +DEFINE_DEVICE_ADAPTER(ELLPACK_SPECIALIZATION) + // A functor that copies the data from one EllpackPage to another. struct CopyPage { common::CompressedBufferWriter cbw; @@ -295,15 +483,4 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const { common::CompressedIterator(gidx_buffer.ConstDevicePointer(), NumSymbols())); } - -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, - const SparsePage& page, bool is_dense, - size_t row_stride) - : cuts_(std::move(cuts)), - is_dense(is_dense), - n_rows(page.Size()), - row_stride(row_stride) { - this->InitCompressedData(device); - this->CreateHistIndices(device, page); -} } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 98ed3587a490..011190a07643 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -159,6 +159,10 @@ class EllpackPageImpl { */ explicit EllpackPageImpl(DMatrix* dmat, const BatchParam& parm); + template + explicit EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense, int nthread, + int max_bin, common::Span row_counts_span, + size_t row_stride); /*! \brief Copy the elements of the given ELLPACK page into this page. * * @param device The GPU device to use. @@ -229,6 +233,19 @@ public: common::Monitor monitor_; }; +inline size_t GetRowStride(DMatrix* dmat) { + if (dmat->IsDense()) return dmat->Info().num_col_; + + size_t row_stride = 0; + for (const auto& batch : dmat->GetBatches()) { + const auto& row_offset = batch.offset.ConstHostVector(); + for (auto i = 1ull; i < row_offset.size(); i++) { + row_stride = std::max( + row_stride, static_cast(row_offset[i] - row_offset[i - 1])); + } + } + return row_stride; +} } // namespace xgboost #endif // XGBOOST_DATA_ELLPACK_PAGE_H_ diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 485df9dc5f52..1b91d1ba2bb9 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -13,20 +13,6 @@ namespace xgboost { namespace data { -size_t GetRowStride(DMatrix* dmat) { - if (dmat->IsDense()) return dmat->Info().num_col_; - - size_t row_stride = 0; - for (const auto& batch : dmat->GetBatches()) { - const auto& row_offset = batch.offset.ConstHostVector(); - for (auto i = 1ull; i < row_offset.size(); i++) { - row_stride = std::max( - row_stride, static_cast(row_offset[i] - row_offset[i - 1])); - } - } - return row_stride; -} - // Build the quantile sketch across the whole input data, then use the histogram cuts to compress // each CSR page, and write the accumulated ELLPACK pages to disk. EllpackPageSource::EllpackPageSource(DMatrix* dmat, diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index c7074c092916..108af403b1a3 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -127,15 +127,16 @@ inline void CheckCacheFileExists(const std::string& file) { } } - /** - * \brief Given a set of cache files and page type, this object iterates over batches using prefetching for improved performance. Not thread safe. - * - * \tparam PageT Type of the page t. - */ - template +/** + * \brief Given a set of cache files and page type, this object iterates over batches + * using prefetching for improved performance. Not thread safe. + * + * \tparam PageT Type of the page t. + */ +template class ExternalMemoryPrefetcher : dmlc::DataIter { public: - explicit ExternalMemoryPrefetcher(const CacheInfo& info) noexcept(false) + explicit ExternalMemoryPrefetcher(const CacheInfo& info) noexcept(false) : base_rowid_(0), page_(nullptr), clock_ptr_(0) { // read in the info files CHECK_NE(info.name_shards.size(), 0U); From 559cbcff2dbdfdd452dbef4a624e08e631e785b7 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 30 Apr 2020 12:49:00 +0800 Subject: [PATCH 2/3] Remove debug code. --- src/data/ellpack_page.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index c92f7dd7f28c..857e7aed7636 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -266,10 +266,6 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx, writer; // For some reason this variable gets captured as const size_t row_idx = idx / row_stride; size_t row_offset = idx % row_stride; - assert(row_idx < row_counts.size()); - if (row_idx >= row_counts.size()) { - printf("row_counts: %lld, idx: %lld\n", row_counts.size(), row_idx); - } if (row_offset >= row_counts[row_idx]) { writer_non_const.AtomicWriteSymbol(d_compressed_buffer, device_accessor.NullValue(), idx); From c38ae9df408454084d3b74ea597fb886640c98ea Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 6 May 2020 17:09:48 +0800 Subject: [PATCH 3/3] Remove macro from header. --- src/data/device_adapter.cuh | 4 ---- src/data/device_dmatrix.cu | 3 ++- src/data/ellpack_page.cu | 3 ++- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 64b45f01581c..f52b45aa1c83 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -233,10 +233,6 @@ size_t GetRowCounts(const AdapterBatchT& batch, common::Span offset, thrust::maximum()); return row_stride; } - -#define DEFINE_DEVICE_ADAPTER(__func) \ - __func(::xgboost::data::CudfAdapter); \ - __func(::xgboost::data::CupyAdapter); }; // namespace data } // namespace xgboost #endif // XGBOOST_DATA_DEVICE_ADAPTER_H_ diff --git a/src/data/device_dmatrix.cu b/src/data/device_dmatrix.cu index e7512b107a42..092d4f678471 100644 --- a/src/data/device_dmatrix.cu +++ b/src/data/device_dmatrix.cu @@ -53,6 +53,7 @@ DeviceDMatrix::DeviceDMatrix(AdapterT* adapter, float missing, int nthread, int template DeviceDMatrix::DeviceDMatrix(__ADAPTER_T* adapter, float missing, \ int nthread, int max_bin); -DEFINE_DEVICE_ADAPTER(DEVICE_DMARIX_SPECIALIZATION) +DEVICE_DMARIX_SPECIALIZATION(CudfAdapter); +DEVICE_DMARIX_SPECIALIZATION(CupyAdapter); } // namespace data } // namespace xgboost diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 857e7aed7636..cd9b1360466d 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -299,7 +299,8 @@ EllpackPageImpl::EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense common::Span row_counts_span, \ size_t row_stride); -DEFINE_DEVICE_ADAPTER(ELLPACK_SPECIALIZATION) +ELLPACK_SPECIALIZATION(data::CudfAdapter) +ELLPACK_SPECIALIZATION(data::CupyAdapter) // A functor that copies the data from one EllpackPage to another. struct CopyPage {