diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc index e6182e07b976..a18ca0798278 100644 --- a/plugin/sycl/data/gradient_index.cc +++ b/plugin/sycl/data/gradient_index.cc @@ -50,10 +50,9 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { template void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, + Context const * ctx, BinIdxType* index_data, - DMatrix *dmat, - size_t nbins, - size_t row_stride) { + DMatrix *dmat) { if (nbins == 0) return; const bst_float* cut_values = cut.cut_values_.ConstDevicePointer(); const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer(); @@ -61,17 +60,19 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); - ::sycl::event event; for (auto &batch : dmat->GetBatches()) { - for (auto &batch : dmat->GetBatches()) { - const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer(); - const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer(); - size_t batch_size = batch.Size(); - if (batch_size > 0) { - const auto base_rowid = batch.base_rowid; - event = qu->submit([&](::sycl::handler& cgh) { - cgh.depends_on(event); - cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) { + batch.data.SetDevice(ctx->Device()); + batch.offset.SetDevice(ctx->Device()); + + const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer(); + const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer(); + size_t batch_size = batch.Size(); + if (batch_size > 0) { + const auto base_rowid = batch.base_rowid; + size_t row_stride = this->row_stride; + size_t nbins = this->nbins; + qu->submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) { const size_t i = pid.get_id(0); const size_t ibegin = offset_vec[i]; const size_t iend = offset_vec[i + 1]; @@ -92,23 +93,22 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, } }); }); - } + qu->wait(); } } - qu->wait(); } -void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { - if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { +void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) { + if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense_) { index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize); - index.Resize((sizeof(uint8_t)) * n_index); + index.Resize(qu, (sizeof(uint8_t)) * n_index); } else if ((max_num_bins - 1 > static_cast(std::numeric_limits::max()) && - max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { + max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense_) { index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize); - index.Resize((sizeof(uint16_t)) * n_index); + index.Resize(qu, (sizeof(uint16_t)) * n_index); } else { index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize); - index.Resize((sizeof(uint32_t)) * n_index); + index.Resize(qu, (sizeof(uint32_t)) * n_index); } } @@ -122,52 +122,50 @@ void GHistIndexMatrix::Init(::sycl::queue* qu, cut.SetDevice(ctx->Device()); max_num_bins = max_bins; - const uint32_t nbins = cut.Ptrs().back(); - this->nbins = nbins; + nbins = cut.Ptrs().back(); hit_count.SetDevice(ctx->Device()); hit_count.Resize(nbins, 0); - this->p_fmat = dmat; const bool isDense = dmat->IsDense(); this->isDense_ = isDense; - index.setQueue(qu); - row_stride = 0; size_t n_rows = 0; - for (const auto& batch : dmat->GetBatches()) { - const auto& row_offset = batch.offset.ConstHostVector(); - batch.data.SetDevice(ctx->Device()); - batch.offset.SetDevice(ctx->Device()); - n_rows += batch.Size(); - for (auto i = 1ull; i < row_offset.size(); i++) { - row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); + if (!isDense) { + for (const auto& batch : dmat->GetBatches()) { + const auto& row_offset = batch.offset.ConstHostVector(); + n_rows += batch.Size(); + for (auto i = 1ull; i < row_offset.size(); i++) { + row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); + } } + } else { + row_stride = nfeatures; + n_rows = dmat->Info().num_row_; } const size_t n_offsets = cut.cut_ptrs_.Size() - 1; const size_t n_index = n_rows * row_stride; - ResizeIndex(n_index, isDense); + ResizeIndex(qu, n_index); CHECK_GT(cut.cut_values_.Size(), 0U); if (isDense) { BinTypeSize curent_bin_size = index.GetBinTypeSize(); if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) { - SetIndexData(qu, index.data(), dmat, nbins, row_stride); - + SetIndexData(qu, ctx, index.data(), dmat); } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) { - SetIndexData(qu, index.data(), dmat, nbins, row_stride); + SetIndexData(qu, ctx, index.data(), dmat); } else { CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize); - SetIndexData(qu, index.data(), dmat, nbins, row_stride); + SetIndexData(qu, ctx, index.data(), dmat); } /* For sparse DMatrix we have to store index of feature for each bin in index field to chose right offset. So offset is nullptr and index is not reduced */ } else { sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t)); - SetIndexData(qu, index.data(), dmat, nbins, row_stride); + SetIndexData(qu, ctx, index.data(), dmat); } } diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h index b88f2a8015ce..15748fa7f47f 100644 --- a/plugin/sycl/data/gradient_index.h +++ b/plugin/sycl/data/gradient_index.h @@ -31,21 +31,9 @@ struct Index { Index& operator=(Index&& i) = delete; void SetBinTypeSize(BinTypeSize binTypeSize) { binTypeSize_ = binTypeSize; - switch (binTypeSize) { - case BinTypeSize::kUint8BinsTypeSize: - func_ = &GetValueFromUint8; - break; - case BinTypeSize::kUint16BinsTypeSize: - func_ = &GetValueFromUint16; - break; - case BinTypeSize::kUint32BinsTypeSize: - func_ = &GetValueFromUint32; - break; - default: - CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize || - binTypeSize == BinTypeSize::kUint16BinsTypeSize || - binTypeSize == BinTypeSize::kUint32BinsTypeSize); - } + CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize || + binTypeSize == BinTypeSize::kUint16BinsTypeSize || + binTypeSize == BinTypeSize::kUint32BinsTypeSize); } BinTypeSize GetBinTypeSize() const { return binTypeSize_; @@ -65,8 +53,8 @@ struct Index { return data_.Size() / (binTypeSize_); } - void Resize(const size_t nBytesData) { - data_.Resize(qu_, nBytesData); + void Resize(::sycl::queue* qu, const size_t nBytesData) { + data_.Resize(qu, nBytesData); } uint8_t* begin() const { @@ -77,28 +65,9 @@ struct Index { return data_.End(); } - void setQueue(::sycl::queue* qu) { - qu_ = qu; - } - private: - static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) { - return reinterpret_cast(t)[i]; - } - static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) { - return reinterpret_cast(t)[i]; - } - static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) { - return reinterpret_cast(t)[i]; - } - - using Func = uint32_t (*)(const uint8_t*, size_t); - USMVector data_; BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize}; - Func func_; - - ::sycl::queue* qu_; }; /*! @@ -116,22 +85,19 @@ struct GHistIndexMatrix { USMVector sort_buff; /*! \brief The corresponding cuts */ xgboost::common::HistogramCuts cut; - DMatrix* p_fmat; size_t max_num_bins; size_t nbins; size_t nfeatures; size_t row_stride; // Create a global histogram matrix based on a given DMatrix device wrapper - void Init(::sycl::queue* qu, Context const * ctx, - DMatrix *dmat, int max_num_bins); + void Init(::sycl::queue* qu, Context const * ctx, DMatrix *dmat, int max_num_bins); template - void SetIndexData(::sycl::queue* qu, BinIdxType* index_data, - DMatrix *dmat, - size_t nbins, size_t row_stride); + void SetIndexData(::sycl::queue* qu, Context const * ctx, BinIdxType* index_data, + DMatrix *dmat); - void ResizeIndex(size_t n_index, bool isDense); + void ResizeIndex(::sycl::queue* qu, size_t n_index); inline void GetFeatureCounts(size_t* counts) const { auto nfeature = cut.cut_ptrs_.Size() - 1; diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc index 43356f64eb0b..9eb05271084c 100755 --- a/plugin/sycl/predictor/predictor.cc +++ b/plugin/sycl/predictor/predictor.cc @@ -291,7 +291,7 @@ class Predictor : public xgboost::Predictor { } if (num_group == 1) { - float sum = 0.0; + float& sum = out_predictions[row_idx]; for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; if constexpr (any_missing) { @@ -300,7 +300,6 @@ class Predictor : public xgboost::Predictor { sum += GetLeafWeight(first_node, fval_buff_row_ptr); } } - out_predictions[row_idx] += sum; } else { for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; @@ -333,7 +332,6 @@ class Predictor : public xgboost::Predictor { int num_features = dmat->Info().num_col_; float* out_predictions = out_preds->DevicePointer(); - ::sycl::event event; for (auto &batch : dmat->GetBatches()) { batch.data.SetDevice(ctx_->Device()); batch.offset.SetDevice(ctx_->Device()); @@ -343,6 +341,7 @@ class Predictor : public xgboost::Predictor { if (batch_size > 0) { const auto base_rowid = batch.base_rowid; + ::sycl::event event; if (needs_buffer_update) { fval_buff.ResizeNoCopy(qu_, num_features * batch_size); if constexpr (any_missing) { @@ -354,9 +353,9 @@ class Predictor : public xgboost::Predictor { row_ptr, batch_size, num_features, num_group, tree_begin, tree_end); needs_buffer_update = (batch_size != out_preds->Size()); + qu_->wait(); } } - qu_->wait(); } mutable USMVector fval_buff; diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc index 5928988c6441..584b5c26fb72 100644 --- a/tests/cpp/plugin/test_sycl_partition_builder.cc +++ b/tests/cpp/plugin/test_sycl_partition_builder.cc @@ -67,7 +67,7 @@ void TestPartitioning(float sparsity, int max_bins) { std::vector ridx_left(num_rows, 0); std::vector ridx_right(num_rows, 0); - for (auto &batch : gmat.p_fmat->GetBatches()) { + for (auto &batch : p_fmat->GetBatches()) { const auto& data_vec = batch.data.HostVector(); const auto& offset_vec = batch.offset.HostVector(); diff --git a/tests/python-sycl/test_sycl_training_continuation.py b/tests/python-sycl/test_sycl_training_continuation.py index e2a11c987bb4..71d5965600e7 100644 --- a/tests/python-sycl/test_sycl_training_continuation.py +++ b/tests/python-sycl/test_sycl_training_continuation.py @@ -9,8 +9,8 @@ class TestSYCLTrainingContinuation: def run_training_continuation(self, use_json): kRows = 64 kCols = 32 - X = np.random.randn(kRows, kCols) - y = np.random.randn(kRows) + X = rng.randn(kRows, kCols) + y = rng.randn(kRows) dtrain = xgb.DMatrix(X, y) params = { "device": "sycl",