Skip to content

fix training continuation for iGPUs #71

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 37 additions & 39 deletions plugin/sycl/data/gradient_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,28 +50,29 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {

template <typename BinIdxType, bool isDense>
void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
Context const * ctx,
BinIdxType* index_data,
DMatrix *dmat,
size_t nbins,
size_t row_stride) {
DMatrix *dmat) {
if (nbins == 0) return;
const bst_float* cut_values = cut.cut_values_.ConstDevicePointer();
const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer();
size_t* hit_count_ptr = hit_count.DevicePointer();

BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());

::sycl::event event;
for (auto &batch : dmat->GetBatches<SparsePage>()) {
for (auto &batch : dmat->GetBatches<SparsePage>()) {
const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
size_t batch_size = batch.Size();
if (batch_size > 0) {
const auto base_rowid = batch.base_rowid;
event = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) {
batch.data.SetDevice(ctx->Device());
batch.offset.SetDevice(ctx->Device());

const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
size_t batch_size = batch.Size();
if (batch_size > 0) {
const auto base_rowid = batch.base_rowid;
size_t row_stride = this->row_stride;
size_t nbins = this->nbins;
qu->submit([&](::sycl::handler& cgh) {
cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) {
const size_t i = pid.get_id(0);
const size_t ibegin = offset_vec[i];
const size_t iend = offset_vec[i + 1];
Expand All @@ -92,23 +93,22 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
}
});
});
}
qu->wait();
}
}
qu->wait();
}

void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense_) {
index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
index.Resize(qu, (sizeof(uint8_t)) * n_index);
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense) {
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense_) {
index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize);
index.Resize((sizeof(uint16_t)) * n_index);
index.Resize(qu, (sizeof(uint16_t)) * n_index);
} else {
index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize);
index.Resize((sizeof(uint32_t)) * n_index);
index.Resize(qu, (sizeof(uint32_t)) * n_index);
}
}

Expand All @@ -122,52 +122,50 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
cut.SetDevice(ctx->Device());

max_num_bins = max_bins;
const uint32_t nbins = cut.Ptrs().back();
this->nbins = nbins;
nbins = cut.Ptrs().back();

hit_count.SetDevice(ctx->Device());
hit_count.Resize(nbins, 0);

this->p_fmat = dmat;
const bool isDense = dmat->IsDense();
this->isDense_ = isDense;

index.setQueue(qu);

row_stride = 0;
size_t n_rows = 0;
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
const auto& row_offset = batch.offset.ConstHostVector();
batch.data.SetDevice(ctx->Device());
batch.offset.SetDevice(ctx->Device());
n_rows += batch.Size();
for (auto i = 1ull; i < row_offset.size(); i++) {
row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
if (!isDense) {
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
const auto& row_offset = batch.offset.ConstHostVector();
n_rows += batch.Size();
for (auto i = 1ull; i < row_offset.size(); i++) {
row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
}
}
} else {
row_stride = nfeatures;
n_rows = dmat->Info().num_row_;
}

const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
const size_t n_index = n_rows * row_stride;
ResizeIndex(n_index, isDense);
ResizeIndex(qu, n_index);

CHECK_GT(cut.cut_values_.Size(), 0U);

if (isDense) {
BinTypeSize curent_bin_size = index.GetBinTypeSize();
if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) {
SetIndexData<uint8_t, true>(qu, index.data<uint8_t>(), dmat, nbins, row_stride);

SetIndexData<uint8_t, true>(qu, ctx, index.data<uint8_t>(), dmat);
} else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) {
SetIndexData<uint16_t, true>(qu, index.data<uint16_t>(), dmat, nbins, row_stride);
SetIndexData<uint16_t, true>(qu, ctx, index.data<uint16_t>(), dmat);
} else {
CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
SetIndexData<uint32_t, true>(qu, index.data<uint32_t>(), dmat, nbins, row_stride);
SetIndexData<uint32_t, true>(qu, ctx, index.data<uint32_t>(), dmat);
}
/* For sparse DMatrix we have to store index of feature for each bin
in index field to chose right offset. So offset is nullptr and index is not reduced */
} else {
sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t));
SetIndexData<uint32_t, false>(qu, index.data<uint32_t>(), dmat, nbins, row_stride);
SetIndexData<uint32_t, false>(qu, ctx, index.data<uint32_t>(), dmat);
}
}

Expand Down
52 changes: 9 additions & 43 deletions plugin/sycl/data/gradient_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,9 @@ struct Index {
Index& operator=(Index&& i) = delete;
void SetBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
switch (binTypeSize) {
case BinTypeSize::kUint8BinsTypeSize:
func_ = &GetValueFromUint8;
break;
case BinTypeSize::kUint16BinsTypeSize:
func_ = &GetValueFromUint16;
break;
case BinTypeSize::kUint32BinsTypeSize:
func_ = &GetValueFromUint32;
break;
default:
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
binTypeSize == BinTypeSize::kUint32BinsTypeSize);
}
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
binTypeSize == BinTypeSize::kUint32BinsTypeSize);
}
BinTypeSize GetBinTypeSize() const {
return binTypeSize_;
Expand All @@ -65,8 +53,8 @@ struct Index {
return data_.Size() / (binTypeSize_);
}

void Resize(const size_t nBytesData) {
data_.Resize(qu_, nBytesData);
void Resize(::sycl::queue* qu, const size_t nBytesData) {
data_.Resize(qu, nBytesData);
}

uint8_t* begin() const {
Expand All @@ -77,28 +65,9 @@ struct Index {
return data_.End();
}

void setQueue(::sycl::queue* qu) {
qu_ = qu;
}

private:
static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) {
return reinterpret_cast<const uint8_t*>(t)[i];
}
static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) {
return reinterpret_cast<const uint16_t*>(t)[i];
}
static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) {
return reinterpret_cast<const uint32_t*>(t)[i];
}

using Func = uint32_t (*)(const uint8_t*, size_t);

USMVector<uint8_t, MemoryType::on_device> data_;
BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize};
Func func_;

::sycl::queue* qu_;
};

/*!
Expand All @@ -116,22 +85,19 @@ struct GHistIndexMatrix {
USMVector<uint8_t, MemoryType::on_device> sort_buff;
/*! \brief The corresponding cuts */
xgboost::common::HistogramCuts cut;
DMatrix* p_fmat;
size_t max_num_bins;
size_t nbins;
size_t nfeatures;
size_t row_stride;

// Create a global histogram matrix based on a given DMatrix device wrapper
void Init(::sycl::queue* qu, Context const * ctx,
DMatrix *dmat, int max_num_bins);
void Init(::sycl::queue* qu, Context const * ctx, DMatrix *dmat, int max_num_bins);

template <typename BinIdxType, bool isDense>
void SetIndexData(::sycl::queue* qu, BinIdxType* index_data,
DMatrix *dmat,
size_t nbins, size_t row_stride);
void SetIndexData(::sycl::queue* qu, Context const * ctx, BinIdxType* index_data,
DMatrix *dmat);

void ResizeIndex(size_t n_index, bool isDense);
void ResizeIndex(::sycl::queue* qu, size_t n_index);

inline void GetFeatureCounts(size_t* counts) const {
auto nfeature = cut.cut_ptrs_.Size() - 1;
Expand Down
7 changes: 3 additions & 4 deletions plugin/sycl/predictor/predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ class Predictor : public xgboost::Predictor {
}

if (num_group == 1) {
float sum = 0.0;
float& sum = out_predictions[row_idx];
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
const Node* first_node = nodes + first_node_position[tree_idx - tree_begin];
if constexpr (any_missing) {
Expand All @@ -300,7 +300,6 @@ class Predictor : public xgboost::Predictor {
sum += GetLeafWeight(first_node, fval_buff_row_ptr);
}
}
out_predictions[row_idx] += sum;
} else {
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
const Node* first_node = nodes + first_node_position[tree_idx - tree_begin];
Expand Down Expand Up @@ -333,7 +332,6 @@ class Predictor : public xgboost::Predictor {
int num_features = dmat->Info().num_col_;

float* out_predictions = out_preds->DevicePointer();
::sycl::event event;
for (auto &batch : dmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
Expand All @@ -343,6 +341,7 @@ class Predictor : public xgboost::Predictor {
if (batch_size > 0) {
const auto base_rowid = batch.base_rowid;

::sycl::event event;
if (needs_buffer_update) {
fval_buff.ResizeNoCopy(qu_, num_features * batch_size);
if constexpr (any_missing) {
Expand All @@ -354,9 +353,9 @@ class Predictor : public xgboost::Predictor {
row_ptr, batch_size, num_features,
num_group, tree_begin, tree_end);
needs_buffer_update = (batch_size != out_preds->Size());
qu_->wait();
}
}
qu_->wait();
}

mutable USMVector<float, MemoryType::on_device> fval_buff;
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/plugin/test_sycl_partition_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void TestPartitioning(float sparsity, int max_bins) {

std::vector<uint8_t> ridx_left(num_rows, 0);
std::vector<uint8_t> ridx_right(num_rows, 0);
for (auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
for (auto &batch : p_fmat->GetBatches<SparsePage>()) {
const auto& data_vec = batch.data.HostVector();
const auto& offset_vec = batch.offset.HostVector();

Expand Down
4 changes: 2 additions & 2 deletions tests/python-sycl/test_sycl_training_continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ class TestSYCLTrainingContinuation:
def run_training_continuation(self, use_json):
kRows = 64
kCols = 32
X = np.random.randn(kRows, kCols)
y = np.random.randn(kRows)
X = rng.randn(kRows, kCols)
y = rng.randn(kRows)
dtrain = xgb.DMatrix(X, y)
params = {
"device": "sycl",
Expand Down
Loading