Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ranking metric computation acceleration on gpu #5326

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <thrust/logical.h>
#include <thrust/gather.h>

#include <omp.h>
#include <rabit/rabit.h>
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
Expand Down Expand Up @@ -482,6 +484,176 @@ using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;

// This type sorts an array which is divided into multiple groups. The sorting is influenced
// by the function object 'Comparator'.
template <typename T>
class SegmentSorter {
 private:
  // Items sorted within each group.
  caching_device_vector<T> ditems_;

  // Original position of the items before they are sorted within their groups.
  caching_device_vector<uint32_t> doriginal_pos_;

  // Group ID for every element of the original list; delineates the different groups.
  caching_device_vector<uint32_t> group_segments_;

  // Group boundary offsets; kept on the device as it is used in the kernels.
  caching_device_vector<uint32_t> dgroups_;  // Group information on device

  // Where did the item that was originally present at position 'x' move to after the sort.
  caching_device_vector<uint32_t> dindexable_sorted_pos_;

  // Initialize everything but the segments.
  void Init(uint32_t num_elems) {
    ditems_.resize(num_elems);

    doriginal_pos_.resize(num_elems);
    // Identity permutation; stable_sort_by_key below rearranges it alongside the items.
    thrust::sequence(doriginal_pos_.begin(), doriginal_pos_.end());
  }

  // Initialize all state, including the group segments, from host-side group boundaries.
  void Init(const std::vector<uint32_t> &groups) {
    uint32_t num_elems = groups.back();
    this->Init(num_elems);
    this->CreateGroupSegments(groups);
  }

 public:
  // Assign a group ID to every element based on the boundary offsets in 'groups'.
  // 'groups.back()' is the total number of elements; presumably groups holds
  // cumulative offsets, e.g. [0, 3, 5]: elements [0, 3) belong to group 0 and
  // elements [3, 5) to group 1 — TODO confirm against callers.
  // This needs to be public due to the device lambda.
  void CreateGroupSegments(const std::vector<uint32_t> &groups) {
    uint32_t num_elems = groups.back();
    group_segments_.resize(num_elems, 0);

    dgroups_ = groups;

    if (GetNumGroups() == 1) return;  // There are no segments; hence, no need to compute them

    // Define the segments by assigning a group ID to each element.
    const uint32_t *dgroups = dgroups_.data().get();
    uint32_t ngroups = dgroups_.size();
    auto ComputeGroupIDLambda = [=] __device__(uint32_t idx) {
      return dh::UpperBound(dgroups, ngroups, idx) - 1;
    };  // NOLINT

    thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
                      thrust::make_counting_iterator(num_elems),
                      group_segments_.begin(),
                      ComputeGroupIDLambda);
  }

  // Accessors that return device pointers.
  inline const T *GetItemsPtr() const { return ditems_.data().get(); }
  inline uint32_t GetNumItems() const { return ditems_.size(); }
  inline const caching_device_vector<T> &GetItems() const {
    return ditems_;
  }

  inline const uint32_t *GetOriginalPositionsPtr() const { return doriginal_pos_.data().get(); }
  inline const caching_device_vector<uint32_t> &GetOriginalPositions() const {
    return doriginal_pos_;
  }

  inline const caching_device_vector<uint32_t> &GetGroupSegments() const {
    return group_segments_;
  }

  // NOTE(review): underflows if dgroups_ is empty, i.e. if called before SortItems/Init.
  inline uint32_t GetNumGroups() const { return dgroups_.size() - 1; }
  inline const uint32_t *GetGroupsPtr() const { return dgroups_.data().get(); }
  inline const caching_device_vector<uint32_t> &GetGroups() const { return dgroups_; }

  inline const caching_device_vector<uint32_t> &GetIndexableSortedPositions() const {
    return dindexable_sorted_pos_;
  }

  // Sort an array that is divided into multiple groups. The array is sorted within each group.
  // This version provides the group information that is on the host.
  // The array is sorted based on an adaptable binary predicate. By default a stateless predicate
  // is used.
  template <typename Comparator = thrust::greater<T>>
  void SortItems(const T *ditems, uint32_t item_size, const std::vector<uint32_t> &groups,
                 const Comparator &comp = Comparator()) {
    this->Init(groups);
    this->SortItems(ditems, item_size, group_segments_, comp);
  }

  // Sort an array that is divided into multiple groups. The array is sorted within each group.
  // This version provides the group information that is on the device.
  // The array is sorted based on an adaptable binary predicate. By default a stateless predicate
  // is used.
  template <typename Comparator = thrust::greater<T>>
  void SortItems(const T *ditems, uint32_t item_size,
                 const caching_device_vector<uint32_t> &group_segments,
                 const Comparator &comp = Comparator()) {
    this->Init(item_size);

    // Sort the items that are grouped. We would like to avoid using predicates to perform the sort,
    // as thrust resorts to using a merge sort as opposed to a much much faster radix sort
    // when comparators are used. Hence, the following algorithm is used. This is done so that
    // we can grab the appropriate related values from the original list later, after the
    // items are sorted.
    //
    // Here is the internal representation:
    // dgroups_:        [ 0, 3, 5, 8, 10 ]
    // group_segments_: 0 0 0 | 1 1 | 2 2 2 | 3 3
    // doriginal_pos_:  0 1 2 | 3 4 | 5 6 7 | 8 9
    // ditems_:         1 0 1 | 2 1 | 1 3 3 | 4 4 (from original items)
    //
    // Sort the items first and make a note of the original positions in doriginal_pos_
    // based on the sort
    // ditems_:         4 4 3 3 2 1 1 1 1 0
    // doriginal_pos_:  8 9 6 7 3 0 2 4 5 1
    // NOTE: This consumes space, but is much faster than some of the other approaches - sorting
    // in kernel, sorting using predicates etc.

    ditems_.assign(thrust::device_ptr<const T>(ditems),
                   thrust::device_ptr<const T>(ditems) + item_size);

    // Allocator to be used by sort for managing space overhead while sorting.
    dh::XGBCachingDeviceAllocator<char> alloc;

    thrust::stable_sort_by_key(thrust::cuda::par(alloc),
                               ditems_.begin(), ditems_.end(),
                               doriginal_pos_.begin(), comp);

    if (GetNumGroups() == 1) return;  // The entire array is sorted, as it isn't segmented

    // Next, gather the segments based on the doriginal_pos_. This is to reflect the
    // holistic item sort order on the segments
    // group_segments_c_: 3 3 2 2 1 0 0 1 2 0
    // doriginal_pos_:    8 9 6 7 3 0 2 4 5 1 (stays the same)
    caching_device_vector<uint32_t> group_segments_c(group_segments);
    thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
                   group_segments.begin(), group_segments_c.begin());

    // Now, sort the group segments so that you may bring the items within the group together,
    // in the process also noting the relative changes to the doriginal_pos_ while that happens
    // group_segments_c_: 0 0 0 1 1 2 2 2 3 3
    // doriginal_pos_:    0 2 1 3 4 6 7 5 8 9
    thrust::stable_sort_by_key(thrust::cuda::par(alloc),
                               group_segments_c.begin(), group_segments_c.end(),
                               doriginal_pos_.begin(), thrust::less<uint32_t>());

    // Finally, gather the original items based on doriginal_pos_ to sort the input and
    // to store them in ditems_
    // doriginal_pos_: 0 2 1 3 4 6 7 5 8 9 (stays the same)
    // ditems_:        1 1 0 2 1 3 3 1 4 4 (from unsorted items - ditems)
    thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
                   thrust::device_ptr<const T>(ditems), ditems_.begin());
  }

  // Determine where an item that was originally present at position 'x' has been relocated to
  // after a sort. Creation of such an index has to be explicitly requested after a sort.
  void CreateIndexableSortedPositions() {
    dindexable_sorted_pos_.resize(GetNumItems());
    thrust::scatter(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
                    thrust::make_counting_iterator(GetNumItems()),  // Rearrange indices...
                    // ...based on this map
                    thrust::device_ptr<const uint32_t>(GetOriginalPositionsPtr()),
                    dindexable_sorted_pos_.begin());  // Write results into this
  }
};

/**
* \brief A double buffer, useful for algorithms like sort.
*/
Expand Down
54 changes: 43 additions & 11 deletions src/metric/metric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
* \brief Registry of objective functions.
*/
#include <dmlc/registry.h>

#include <xgboost/metric.h>
#include <xgboost/generic_parameters.h>

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
}
#include "metric_common.h"

namespace xgboost {
Metric* Metric::Create(const std::string& name, GenericParameter const* tparam) {
template <typename MetricRegistry>
Metric* CreateMetricImpl(const std::string& name, GenericParameter const* tparam) {
std::string buf = name;
std::string prefix = name;
const char* param;
Expand All @@ -26,32 +26,64 @@ Metric* Metric::Create(const std::string& name, GenericParameter const* tparam)
prefix = buf;
param = nullptr;
}
auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
if (e == nullptr) {
LOG(FATAL) << "Unknown metric function " << name;
return nullptr;
}
auto p_metric = (e->body)(param);
p_metric->tparam_ = tparam;
return p_metric;
} else {
std::string prefix = buf.substr(0, pos);
auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
if (e == nullptr) {
LOG(FATAL) << "Unknown metric function " << name;
return nullptr;
}
auto p_metric = (e->body)(buf.substr(pos + 1, buf.length()).c_str());
p_metric->tparam_ = tparam;
return p_metric;
}
}

// Factory: instantiate the metric registered under 'name' in the CPU registry.
// Aborts via LOG(FATAL) if no such metric exists.
Metric *
Metric::Create(const std::string& name, GenericParameter const* tparam) {
  Metric *metric = CreateMetricImpl<MetricReg>(name, tparam);
  if (!metric) {
    LOG(FATAL) << "Unknown metric function " << name;
  }

  metric->tparam_ = tparam;
  return metric;
}

// Factory: instantiate the metric registered under 'name' in the GPU registry.
// Unlike Metric::Create, a missing entry is not fatal: a warning is logged and
// nullptr is returned so the caller can fall back to the CPU builder.
// NOTE(review): whether CreateMetricImpl can actually return nullptr here
// depends on its (partially hidden) lookup path — confirm it does not
// LOG(FATAL) for unknown GPU metrics.
Metric *
GPUMetric::CreateGPUMetric(const std::string& name, GenericParameter const* tparam) {
  auto metric = CreateMetricImpl<MetricGPUReg>(name, tparam);
  if (metric == nullptr) {
    LOG(WARNING) << "Cannot find a GPU metric builder for metric " << name
                 << ". Resorting to the CPU builder";
    return metric;
  }

  // Narrowing reference only for the compiler to allow assignment to a base class member.
  // As such, using this narrowed reference to refer to derived members will be an illegal op.
  // This is moot, as this type is stateless.
  static_cast<GPUMetric *>(metric)->tparam_ = tparam;
  return metric;
}
} // namespace xgboost

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
DMLC_REGISTRY_ENABLE(::xgboost::MetricGPUReg);
}

namespace xgboost {
namespace metric {

// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(elementwise_metric);
DMLC_REGISTRY_LINK_TAG(multiclass_metric);
DMLC_REGISTRY_LINK_TAG(rank_metric);
#ifdef XGBOOST_USE_CUDA
DMLC_REGISTRY_LINK_TAG(rank_metric_gpu);
#endif
} // namespace metric
} // namespace xgboost
57 changes: 57 additions & 0 deletions src/metric/metric_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,68 @@
#ifndef XGBOOST_METRIC_METRIC_COMMON_H_
#define XGBOOST_METRIC_METRIC_COMMON_H_

#include <utility>
#include <vector>
#include <limits>
#include <string>

#include "../common/common.h"

// (prediction score, original index) pair and its container, used by the
// ranking metric implementations.
// NOTE(review): an anonymous namespace in a header gives every including TU its
// own internal-linkage copy of these names; consider a named namespace instead
// — TODO confirm no TU relies on the current placement.
namespace {

using PredIndPair = std::pair<xgboost::bst_float, unsigned>;
using PredIndPairContainer = std::vector<PredIndPair>;

}  // anonymous namespace

namespace xgboost {
// This creates a GPU metric instance dynamically and adds it to the GPU metric registry, if not
// present already. This is created when there is a device ordinal present and if xgboost
// is compiled with CUDA support
struct GPUMetric : Metric {
  // Look up 'name' in the GPU metric registry and build the metric; 'tparam' is
  // the generic configuration handed to the created instance.
  static Metric *CreateGPUMetric(const std::string& name, GenericParameter const* tparam);
};

/*!
 * \brief Internal registry entries for GPU Metric factory functions.
 *  The additional parameter const char* param gives the value after @, can be null.
 *  For example, metric map@3, then: param == "3".
 *  Entries are added via the XGBOOST_REGISTER_GPU_METRIC macro.
 */
struct MetricGPUReg
    : public dmlc::FunctionRegEntryBase<MetricGPUReg,
                                        std::function<Metric * (const char*)> > {
};

/*!
* \brief Macro to register metric computed on GPU.
*
* \code
* // example of registering a objective ndcg@k
* XGBOOST_REGISTER_GPU_METRIC(NDCG_GPU, "ndcg")
* .describe("NDCG metric computer on GPU.")
* .set_body([](const char* param) {
* int at_k = atoi(param);
* return new NDCG(at_k);
* });
* \endcode
*/

// Note: Metric names registered in the GPU registry should follow this convention:
// - GPU metric types should be registered with the same name as the non GPU metric types
#define XGBOOST_REGISTER_GPU_METRIC(UniqueId, Name) \
::xgboost::MetricGPUReg& __make_ ## MetricGPUReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::MetricGPUReg>::Get()->__REGISTER__(Name)

namespace metric {

// Ranking config to be used on device and host
struct EvalRankConfig {
  // Evaluate only the top 'topn' items of each group; defaults to "all items"
  // (no truncation) via the maximum representable value.
  unsigned topn{std::numeric_limits<unsigned>::max()};
  // Display name of the metric (e.g. "ndcg@3").
  std::string name;
  // Whether a larger metric value is worse (negated/minimized metric).
  bool minus{false};
};

class PackedReduceResult {
double residue_sum_;
double weights_sum_;
Expand Down
Loading