Fixes for v.2.1.0 #55

Merged

2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -95,7 +95,7 @@ jobs:
run: |
mkdir build
cd build
-cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
+cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
make -j$(nproc)
- name: Run gtest binary for SYCL
run: |
2 changes: 1 addition & 1 deletion .github/workflows/python_tests.yml
@@ -294,7 +294,7 @@ jobs:
run: |
mkdir build
cd build
-cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc
make -j$(nproc)
- name: Install Python package
run: |
2 changes: 0 additions & 2 deletions CMakeLists.txt
@@ -1,8 +1,6 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

if(PLUGIN_SYCL)
-set(CMAKE_CXX_COMPILER "g++")
-set(CMAKE_C_COMPILER "gcc")
string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

6 changes: 3 additions & 3 deletions include/xgboost/linalg.h
@@ -664,13 +664,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {

template <typename T>
auto MakeVec(HostDeviceVector<T> *data) {
-return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(),
-data->Device());
+return MakeVec(data->Device().IsCUDA() ? data->DevicePointer() : data->HostPointer(),
+data->Size(), data->Device());
}

template <typename T>
auto MakeVec(HostDeviceVector<T> const *data) {
-return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),
+return MakeVec(data->Device().IsCUDA() ? data->ConstDevicePointer() : data->ConstHostPointer(),
data->Size(), data->Device());
}

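The intent of this change, as I read it: a SYCL device ordinal is neither CPU nor CUDA, so the old IsCPU() check routed SYCL runs to DevicePointer(), while the plugin presumably keeps its data reachable through the host pointer. Below is a minimal standalone sketch of the selection rule after the fix (not code from this PR; the Kind enum and SelectPointer helper are hypothetical stand-ins for DeviceOrd and MakeVec):

#include <cassert>

// Hypothetical stand-in for xgboost::DeviceOrd, for illustration only.
enum class Kind { kCPU, kCUDA, kSYCL };

template <typename T>
const T* SelectPointer(Kind device, const T* host_ptr, const T* device_ptr) {
  // After the fix: IsCUDA() ? DevicePointer() : HostPointer().
  // Before the fix the condition was IsCPU() ? HostPointer() : DevicePointer(),
  // which sent SYCL ordinals to the device pointer.
  return device == Kind::kCUDA ? device_ptr : host_ptr;
}

int main() {
  float host = 1.0f, dev = 2.0f;
  assert(SelectPointer(Kind::kCPU,  &host, &dev) == &host);
  assert(SelectPointer(Kind::kCUDA, &host, &dev) == &dev);
  assert(SelectPointer(Kind::kSYCL, &host, &dev) == &host);  // host pointer now
  return 0;
}
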
4 changes: 2 additions & 2 deletions plugin/CMakeLists.txt
@@ -10,14 +10,14 @@ if(PLUGIN_SYCL)
target_compile_definitions(plugin_sycl PUBLIC -DXGBOOST_USE_SYCL=1)
target_link_libraries(plugin_sycl PUBLIC -fsycl)
set_target_properties(plugin_sycl PROPERTIES
-COMPILE_FLAGS -fsycl
+COMPILE_FLAGS "-fsycl -fno-sycl-id-queries-fit-in-int"
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
if(USE_OPENMP)
find_package(OpenMP REQUIRED)
set_target_properties(plugin_sycl PROPERTIES
COMPILE_FLAGS "-fsycl -qopenmp")
COMPILE_FLAGS "-fsycl -qopenmp -fno-sycl-id-queries-fit-in-int")
endif()
# Get compilation and link flags of plugin_sycl and propagate to objxgboost
target_link_libraries(objxgboost PUBLIC plugin_sycl)
2 changes: 1 addition & 1 deletion plugin/sycl/common/hist_util.cc
@@ -21,7 +21,7 @@ namespace common {
template<typename GradientSumT>
void InitHist(::sycl::queue qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event) {
-*event = qu.fill(hist->Begin(),
+*event = qu.fill(hist->Data(),
xgboost::detail::GradientPairInternal<GradientSumT>(), size, *event);
}
template void InitHist(::sycl::queue qu,
246 changes: 246 additions & 0 deletions plugin/sycl/common/linalg_op.h
@@ -0,0 +1,246 @@
/**
* Copyright 2021-2024, XGBoost Contributors
* \file linalg_op.h
*/
#ifndef PLUGIN_SYCL_COMMON_LINALG_OP_H_
#define PLUGIN_SYCL_COMMON_LINALG_OP_H_

#include <array>
#include <vector>
#include <utility>

#include "../data.h"

#include <CL/sycl.hpp>

namespace xgboost {
namespace sycl {
namespace linalg {

struct WorkGroupsParams {
size_t n_workgroups;
size_t workgroup_size;
};

template <typename Fn>
::sycl::event GroupWiseKernel(::sycl::queue* qu, int* flag_ptr,
const std::vector<::sycl::event>& events,
const WorkGroupsParams& wg, Fn &&fn) {
::sycl::buffer<int, 1> flag_buf(flag_ptr, 1);
auto event = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(events);
auto flag = flag_buf.get_access<::sycl::access::mode::write>(cgh);
cgh.parallel_for_work_group<>(::sycl::range<1>(wg.n_workgroups),
::sycl::range<1>(wg.workgroup_size),
[=](::sycl::group<1> group) {
group.parallel_for_work_item([&](::sycl::h_item<1> item) {
const size_t idx = item.get_global_id()[0];
fn(idx, flag);
});
});
});
return event;
}

struct Argument {
template <typename T>
operator T&&() const;
};

template <typename Fn, typename Is, typename = void>
struct ArgumentsPassedImpl
: std::false_type {};

template <typename Fn, size_t ...Is>
struct ArgumentsPassedImpl<Fn, std::index_sequence<Is...>,
decltype(std::declval<Fn>()(((void)Is, Argument{})...), void())>
: std::true_type {};

template <typename Fn, size_t N>
struct ArgumentsPassed : ArgumentsPassedImpl<Fn, std::make_index_sequence<N>> {};

template <typename OutputDType, typename InputDType,
size_t BatchSize, size_t MaxNumInputs>
class BatchProcessingHelper {
public:
static constexpr size_t kBatchSize = BatchSize;
using InputType = HostDeviceVector<InputDType>;
using OutputType = HostDeviceVector<OutputDType>;

using ConstInputIteratorT =
typename USMVector<InputDType, MemoryType::on_device>::ConstIterator;
using InputIteratorT = typename USMVector<InputDType, MemoryType::on_device>::Iterator;
using OutputIteratorT = typename USMVector<OutputDType, MemoryType::on_device>::Iterator;

private:
template <size_t NumInput = 0>
void Host2Buffers(InputDType* in_buffer_ptr, const InputType& input) {
/*
* Some inputs may have fewer than one sample per output symbol.
*/
const size_t sub_sample_rate = ndata_ * sample_rates_[NumInput+1] / input.Size();
const size_t n_samples = batch_size_ * sample_rates_[NumInput+1] / sub_sample_rate;

const InputDType* in_host_ptr = input.HostPointer() +
batch_begin_ * sample_rates_[NumInput+1] / sub_sample_rate;

events_[NumInput] =
qu_->memcpy(in_buffer_ptr, in_host_ptr, n_samples * sizeof(InputDType),
events_[MaxNumInputs - 2]);
}

template <size_t NumInput = 0, class... InputTypes>
void Host2Buffers(InputDType* in_buffer_ptr, const InputType& input,
const InputTypes&... other_inputs) {
// Make a copy of the first input in the list
Host2Buffers<NumInput>(in_buffer_ptr, input);
// Recursive call for the remaining inputs
InputDType* next_input = in_buffer_.Data() + in_buff_offsets_[NumInput + 1];
Host2Buffers<NumInput+1>(next_input, other_inputs...);
}

void Buffers2Host(OutputType* output) {
const size_t n_samples = batch_size_ * sample_rates_[0];
OutputDType* out_host_ptr = output->HostPointer() + batch_begin_* sample_rates_[0];
events_[MaxNumInputs - 1] =
qu_->memcpy(out_host_ptr, out_buffer_.DataConst(), n_samples * sizeof(OutputDType),
events_[MaxNumInputs - 2]);
}

void Buffers2Host(InputType* output) {
const size_t n_samples = batch_size_ * sample_rates_[1];
InputDType* out_host_ptr = output->HostPointer() + batch_begin_* sample_rates_[1];
events_[MaxNumInputs - 1] =
qu_->memcpy(out_host_ptr, in_buffer_.DataConst(), n_samples * sizeof(InputDType),
events_[MaxNumInputs - 2]);
}

template <size_t NumInputs = 1, typename Fn, class... InputTypes>
void Call(Fn &&fn, ConstInputIteratorT input, const InputTypes... other_inputs) {
static_assert(NumInputs <= MaxNumInputs,
"To many arguments in the passed function");
/* The passed lambda may have fewer inputs than MaxNumInputs,
* so only the required number of arguments is passed.
*/
// 1 for events, 1 for batch_size, 1 for output
if constexpr (ArgumentsPassed<Fn, NumInputs + 1 + 1 + 1>::value) {
events_[MaxNumInputs - 2] = fn(events_, batch_size_,
out_buffer_.Begin(), input, other_inputs...);
} else {
ConstInputIteratorT next_input = in_buffer_.Cbegin() +
in_buff_offsets_[MaxNumInputs - 1 - NumInputs];
Call<NumInputs+1>(std::forward<Fn>(fn), next_input, input, other_inputs...);
}
}

template <size_t NumInputs = 1, typename Fn, class... InputTypes>
void Call(Fn &&fn, InputIteratorT io, ConstInputIteratorT input,
const InputTypes... other_inputs) {
static_assert(NumInputs <= MaxNumInputs,
"To many arguments in the passed function");
if constexpr (ArgumentsPassed<Fn, NumInputs + 1 + 1>::value) {
events_[MaxNumInputs - 2] = fn(events_, batch_size_,
io, input, other_inputs...);
} else {
const ConstInputIteratorT next_input = in_buffer_.Cbegin() +
in_buff_offsets_[MaxNumInputs - NumInputs];
Call<NumInputs+1>(std::forward<Fn>(fn), io, next_input, input, other_inputs...);
}
}

template <size_t NumInputs = 1, typename Fn>
void Call(Fn &&fn, InputIteratorT io) {
static_assert(NumInputs <= MaxNumInputs,
"To many arguments in the passed function");
if constexpr (ArgumentsPassed<Fn, NumInputs + 1 + 1>::value) {
events_[MaxNumInputs - 2] = fn(events_, batch_size_, io);
} else {
const ConstInputIteratorT next_input = in_buffer_.Cbegin() +
in_buff_offsets_[MaxNumInputs - 1];
Call<NumInputs+1>(std::forward<Fn>(fn), io, next_input);
}
}

public:
BatchProcessingHelper() = default;

// The first element of sample_rate always corresponds to the output sample rate
void InitBuffers(::sycl::queue* qu, const std::vector<int>& sample_rate) {
assert(sample_rate.size() == MaxNumInputs + 1);
sample_rates_ = sample_rate;
qu_ = qu;
events_.resize(MaxNumInputs + 2);
out_buffer_.Resize(qu, kBatchSize * sample_rate.front());

in_buff_offsets_[0] = 0;
for (size_t i = 1; i < MaxNumInputs; ++i) {
in_buff_offsets_[i] = in_buff_offsets_[i - 1] + kBatchSize * sample_rate[i];
}
const size_t in_buff_size = in_buff_offsets_.back() + kBatchSize * sample_rate.back();
in_buffer_.Resize(qu, in_buff_size);
}

/*
* Batch-wise processing on the SYCL device
* output = fn(inputs)
*/
template <typename Fn, class... InputTypes>
void Calculate(Fn &&fn, OutputType* output, const InputTypes&... inputs) {
ndata_ = output->Size() / sample_rates_.front();
const size_t nBatch = ndata_ / kBatchSize + (ndata_ % kBatchSize > 0);
for (size_t batch = 0; batch < nBatch; ++batch) {
batch_begin_ = batch * kBatchSize;
batch_end_ = (batch == nBatch - 1) ? ndata_ : batch_begin_ + kBatchSize;
batch_size_ = batch_end_ - batch_begin_;

// Iteratively copy all inputs to device buffers
Host2Buffers(in_buffer_.Data(), inputs...);
// Pack buffers and call function
// We shift input pointer to keep the same order of inputs after packing
Call(std::forward<Fn>(fn), in_buffer_.Cbegin() + in_buff_offsets_.back());
// Copy results to host
Buffers2Host(output);
}
}

/*
* Batch-wise processing on the SYCL device
* input = fn(input, other_inputs)
*/
template <typename Fn, class... InputTypes>
void Calculate(Fn &&fn, InputType* input, const InputTypes&... other_inputs) {
ndata_ = input->Size() / sample_rates_[1];
const size_t nBatch = ndata_ / kBatchSize + (ndata_ % kBatchSize > 0);
for (size_t batch = 0; batch < nBatch; ++batch) {
batch_begin_ = batch * kBatchSize;
batch_end_ = (batch == nBatch - 1) ? ndata_ : batch_begin_ + kBatchSize;
batch_size_ = batch_end_ - batch_begin_;

// Iteratively copy all inputs to device buffers.
// Inputs are passed by const reference.
Host2Buffers(in_buffer_.Data(), *(input), other_inputs...);
// Pack buffers and call function
// We shift input pointer to keep the same order of inputs after packing
Call(std::forward<Fn>(fn), in_buffer_.Begin());
// Copy results to host
Buffers2Host(input);
}
}

private:
std::array<int, MaxNumInputs> in_buff_offsets_;
std::vector<int> sample_rates_;
size_t ndata_;
size_t batch_begin_;
size_t batch_end_;
// may be smaller than kBatchSize for the last batch
size_t batch_size_;
::sycl::queue* qu_;
std::vector<::sycl::event> events_;
USMVector<InputDType, MemoryType::on_device> in_buffer_;
USMVector<OutputDType, MemoryType::on_device> out_buffer_;
};

} // namespace linalg
} // namespace sycl
} // namespace xgboost
#endif // PLUGIN_SYCL_COMMON_LINALG_OP_H_
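
For readers of this PR, here is a rough usage sketch of the new BatchProcessingHelper, to make the calling convention visible. It is not part of the diff: the queue, the a/b/out vectors and the element-wise kernel are invented for illustration, and it assumes USMVector iterators can be indexed inside a kernel like raw USM pointers. The functor receives the event list, the current batch size, the output iterator, and then the input iterators in the same order they were passed to Calculate; it must return the event of the work it submits.

#include <cstddef>
#include <vector>
#include <CL/sycl.hpp>
#include <xgboost/host_device_vector.h>
#include "linalg_op.h"  // hypothetical include path

// Hypothetical example: out[i] = a[i] + b[i], processed in batches of 1024.
void AddVectors(::sycl::queue* qu,
                xgboost::HostDeviceVector<float>* out,
                const xgboost::HostDeviceVector<float>& a,
                const xgboost::HostDeviceVector<float>& b) {
  // <output dtype, input dtype, batch size, max number of inputs>
  xgboost::sycl::linalg::BatchProcessingHelper<float, float, 1024, 2> helper;
  // One sample rate per buffer: the output first, then each input.
  helper.InitBuffers(qu, {1, 1, 1});
  helper.Calculate(
      [qu](const std::vector<::sycl::event>& events, std::size_t batch_size,
           auto out_it, auto a_it, auto b_it) {
        return qu->submit([&](::sycl::handler& cgh) {
          cgh.depends_on(events);  // wait for the host-to-buffer copies
          cgh.parallel_for(::sycl::range<1>(batch_size),
                           [=](::sycl::id<1> i) {
            out_it[i] = a_it[i] + b_it[i];
          });
        });
      },
      out, a, b);
  qu->wait();
}

The sample-rate vector is what lets a buffer hold more than one value per output element; for example {1, 3, 1} would size the first input buffer at three samples per output symbol, matching the sub-sampling arithmetic in Host2Buffers.
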
6 changes: 3 additions & 3 deletions plugin/sycl/common/row_set.h
@@ -71,8 +71,8 @@ class RowSetCollection {
inline void Init() {
CHECK_EQ(elem_of_each_node_.size(), 0U);

-const size_t* begin = row_indices_.Begin();
-const size_t* end = row_indices_.End();
+const size_t* begin = row_indices_.Data();
+const size_t* end = begin + row_indices_.Size();
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
}

@@ -86,7 +86,7 @@
size_t n_right) {
const Elem e = elem_of_each_node_[node_id];
CHECK(e.begin != nullptr);
-size_t* all_begin = row_indices_.Begin();
+size_t* all_begin = row_indices_.Data();
size_t* begin = all_begin + (e.begin - all_begin);

