Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ranking metric computation acceleration on gpu #5326

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <thrust/logical.h>
#include <thrust/gather.h>

#include <omp.h>
#include <rabit/rabit.h>
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
Expand Down Expand Up @@ -482,6 +484,176 @@ using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;

// This type sorts an array which is divided into multiple groups. The sorting is influenced
// by the function object 'Comparator'.
template <typename T>
class SegmentSorter {
 private:
  // Items sorted within each group.
  caching_device_vector<T> ditems_;

  // Original position of the items before they are sorted within their groups.
  caching_device_vector<uint32_t> doriginal_pos_;

  // Group ID for every element of the original list; delineates the different groups.
  caching_device_vector<uint32_t> group_segments_;

  // Group boundary offsets; kept on the device as it is used in the kernels.
  caching_device_vector<uint32_t> dgroups_;  // Group information on device

  // Where did the item that was originally present at position 'x' move to after the sort.
  caching_device_vector<uint32_t> dindexable_sorted_pos_;

  // Initialize everything but the segments.
  void Init(uint32_t num_elems) {
    ditems_.resize(num_elems);

    doriginal_pos_.resize(num_elems);
    // Identity permutation; stable_sort_by_key below rearranges it alongside the items.
    thrust::sequence(doriginal_pos_.begin(), doriginal_pos_.end());
  }

  // Initialize all state, including the group segments, from host-side group boundaries.
  void Init(const std::vector<uint32_t> &groups) {
    uint32_t num_elems = groups.back();
    this->Init(num_elems);
    this->CreateGroupSegments(groups);
  }

 public:
  // Assign a group ID to every element based on the boundary offsets in 'groups'.
  // 'groups.back()' is the total number of elements; presumably groups holds
  // cumulative offsets, e.g. [0, 3, 5]: elements [0, 3) belong to group 0 and
  // elements [3, 5) to group 1 — TODO confirm against callers.
  // This needs to be public due to the device lambda.
  void CreateGroupSegments(const std::vector<uint32_t> &groups) {
    uint32_t num_elems = groups.back();
    group_segments_.resize(num_elems, 0);

    dgroups_ = groups;

    if (GetNumGroups() == 1) return;  // There are no segments; hence, no need to compute them

    // Define the segments by assigning a group ID to each element.
    const uint32_t *dgroups = dgroups_.data().get();
    uint32_t ngroups = dgroups_.size();
    auto ComputeGroupIDLambda = [=] __device__(uint32_t idx) {
      return dh::UpperBound(dgroups, ngroups, idx) - 1;
    };  // NOLINT

    thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
                      thrust::make_counting_iterator(num_elems),
                      group_segments_.begin(),
                      ComputeGroupIDLambda);
  }

  // Accessors that return device pointers.
  inline const T *GetItemsPtr() const { return ditems_.data().get(); }
  inline uint32_t GetNumItems() const { return ditems_.size(); }
  inline const caching_device_vector<T> &GetItems() const {
    return ditems_;
  }

  inline const uint32_t *GetOriginalPositionsPtr() const { return doriginal_pos_.data().get(); }
  inline const caching_device_vector<uint32_t> &GetOriginalPositions() const {
    return doriginal_pos_;
  }

  inline const caching_device_vector<uint32_t> &GetGroupSegments() const {
    return group_segments_;
  }

  // NOTE(review): underflows if dgroups_ is empty, i.e. if called before SortItems/Init.
  inline uint32_t GetNumGroups() const { return dgroups_.size() - 1; }
  inline const uint32_t *GetGroupsPtr() const { return dgroups_.data().get(); }
  inline const caching_device_vector<uint32_t> &GetGroups() const { return dgroups_; }

  inline const caching_device_vector<uint32_t> &GetIndexableSortedPositions() const {
    return dindexable_sorted_pos_;
  }

  // Sort an array that is divided into multiple groups. The array is sorted within each group.
  // This version provides the group information that is on the host.
  // The array is sorted based on an adaptable binary predicate. By default a stateless predicate
  // is used.
  template <typename Comparator = thrust::greater<T>>
  void SortItems(const T *ditems, uint32_t item_size, const std::vector<uint32_t> &groups,
                 const Comparator &comp = Comparator()) {
    this->Init(groups);
    this->SortItems(ditems, item_size, group_segments_, comp);
  }

  // Sort an array that is divided into multiple groups. The array is sorted within each group.
  // This version provides the group information that is on the device.
  // The array is sorted based on an adaptable binary predicate. By default a stateless predicate
  // is used.
  template <typename Comparator = thrust::greater<T>>
  void SortItems(const T *ditems, uint32_t item_size,
                 const caching_device_vector<uint32_t> &group_segments,
                 const Comparator &comp = Comparator()) {
    this->Init(item_size);

    // Sort the items that are grouped. We would like to avoid using predicates to perform the sort,
    // as thrust resorts to using a merge sort as opposed to a much much faster radix sort
    // when comparators are used. Hence, the following algorithm is used. This is done so that
    // we can grab the appropriate related values from the original list later, after the
    // items are sorted.
    //
    // Here is the internal representation:
    // dgroups_:        [ 0, 3, 5, 8, 10 ]
    // group_segments_: 0 0 0 | 1 1 | 2 2 2 | 3 3
    // doriginal_pos_:  0 1 2 | 3 4 | 5 6 7 | 8 9
    // ditems_:         1 0 1 | 2 1 | 1 3 3 | 4 4 (from original items)
    //
    // Sort the items first and make a note of the original positions in doriginal_pos_
    // based on the sort
    // ditems_:         4 4 3 3 2 1 1 1 1 0
    // doriginal_pos_:  8 9 6 7 3 0 2 4 5 1
    // NOTE: This consumes space, but is much faster than some of the other approaches - sorting
    // in kernel, sorting using predicates etc.

    ditems_.assign(thrust::device_ptr<const T>(ditems),
                   thrust::device_ptr<const T>(ditems) + item_size);

    // Allocator to be used by sort for managing space overhead while sorting.
    dh::XGBCachingDeviceAllocator<char> alloc;

    thrust::stable_sort_by_key(thrust::cuda::par(alloc),
                               ditems_.begin(), ditems_.end(),
                               doriginal_pos_.begin(), comp);

    if (GetNumGroups() == 1) return;  // The entire array is sorted, as it isn't segmented

    // Next, gather the segments based on the doriginal_pos_. This is to reflect the
    // holistic item sort order on the segments
    // group_segments_c_: 3 3 2 2 1 0 0 1 2 0
    // doriginal_pos_:    8 9 6 7 3 0 2 4 5 1 (stays the same)
    caching_device_vector<uint32_t> group_segments_c(group_segments);
    thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
                   group_segments.begin(), group_segments_c.begin());

    // Now, sort the group segments so that you may bring the items within the group together,
    // in the process also noting the relative changes to the doriginal_pos_ while that happens
    // group_segments_c_: 0 0 0 1 1 2 2 2 3 3
    // doriginal_pos_:    0 2 1 3 4 6 7 5 8 9
    thrust::stable_sort_by_key(thrust::cuda::par(alloc),
                               group_segments_c.begin(), group_segments_c.end(),
                               doriginal_pos_.begin(), thrust::less<uint32_t>());

    // Finally, gather the original items based on doriginal_pos_ to sort the input and
    // to store them in ditems_
    // doriginal_pos_: 0 2 1 3 4 6 7 5 8 9 (stays the same)
    // ditems_:        1 1 0 2 1 3 3 1 4 4 (from unsorted items - ditems)
    thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
                   thrust::device_ptr<const T>(ditems), ditems_.begin());
  }

  // Determine where an item that was originally present at position 'x' has been relocated to
  // after a sort. Creation of such an index has to be explicitly requested after a sort.
  void CreateIndexableSortedPositions() {
    dindexable_sorted_pos_.resize(GetNumItems());
    thrust::scatter(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
                    thrust::make_counting_iterator(GetNumItems()),  // Rearrange indices...
                    // ...based on this map
                    thrust::device_ptr<const uint32_t>(GetOriginalPositionsPtr()),
                    dindexable_sorted_pos_.begin());  // Write results into this
  }
};

/**
* \brief A double buffer, useful for algorithms like sort.
*/
Expand Down
54 changes: 43 additions & 11 deletions src/metric/metric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
* \brief Registry of objective functions.
*/
#include <dmlc/registry.h>

#include <xgboost/metric.h>
#include <xgboost/generic_parameters.h>

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
}
#include "metric_common.h"

namespace xgboost {
Metric* Metric::Create(const std::string& name, GenericParameter const* tparam) {
template <typename MetricRegistry>
Metric* CreateMetricImpl(const std::string& name, GenericParameter const* tparam) {
std::string buf = name;
std::string prefix = name;
const char* param;
Expand All @@ -26,32 +26,64 @@ Metric* Metric::Create(const std::string& name, GenericParameter const* tparam)
prefix = buf;
param = nullptr;
}
auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
if (e == nullptr) {
LOG(FATAL) << "Unknown metric function " << name;
return nullptr;
}
auto p_metric = (e->body)(param);
p_metric->tparam_ = tparam;
return p_metric;
} else {
std::string prefix = buf.substr(0, pos);
auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
if (e == nullptr) {
LOG(FATAL) << "Unknown metric function " << name;
return nullptr;
}
auto p_metric = (e->body)(buf.substr(pos + 1, buf.length()).c_str());
p_metric->tparam_ = tparam;
return p_metric;
}
}

// Factory: instantiate the metric registered under 'name' in the CPU registry.
// Aborts via LOG(FATAL) if no such metric exists.
Metric *
Metric::Create(const std::string& name, GenericParameter const* tparam) {
  Metric *metric = CreateMetricImpl<MetricReg>(name, tparam);
  if (!metric) {
    LOG(FATAL) << "Unknown metric function " << name;
  }

  metric->tparam_ = tparam;
  return metric;
}

// Factory: instantiate the metric registered under 'name' in the GPU registry.
// Unlike Metric::Create, a missing entry is not fatal: a warning is logged and
// nullptr is returned so the caller can fall back to the CPU builder.
// NOTE(review): whether CreateMetricImpl can actually return nullptr here
// depends on its (partially hidden) lookup path — confirm it does not
// LOG(FATAL) for unknown GPU metrics.
Metric *
GPUMetric::CreateGPUMetric(const std::string& name, GenericParameter const* tparam) {
  auto metric = CreateMetricImpl<MetricGPUReg>(name, tparam);
  if (metric == nullptr) {
    LOG(WARNING) << "Cannot find a GPU metric builder for metric " << name
                 << ". Resorting to the CPU builder";
    return metric;
  }

  // Narrowing reference only for the compiler to allow assignment to a base class member.
  // As such, using this narrowed reference to refer to derived members will be an illegal op.
  // This is moot, as this type is stateless.
  static_cast<GPUMetric *>(metric)->tparam_ = tparam;
  return metric;
}
} // namespace xgboost

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
DMLC_REGISTRY_ENABLE(::xgboost::MetricGPUReg);
}

namespace xgboost {
namespace metric {

// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(elementwise_metric);
DMLC_REGISTRY_LINK_TAG(multiclass_metric);
DMLC_REGISTRY_LINK_TAG(rank_metric);
#ifdef XGBOOST_USE_CUDA
DMLC_REGISTRY_LINK_TAG(rank_metric_gpu);
#endif
} // namespace metric
} // namespace xgboost
57 changes: 57 additions & 0 deletions src/metric/metric_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,68 @@
#ifndef XGBOOST_METRIC_METRIC_COMMON_H_
#define XGBOOST_METRIC_METRIC_COMMON_H_

#include <utility>
#include <vector>
#include <limits>
#include <string>

#include "../common/common.h"

// (prediction score, original index) pair and its container, used by the
// ranking metric implementations.
// NOTE(review): an anonymous namespace in a header gives every including TU its
// own internal-linkage copy of these names; consider a named namespace instead
// — TODO confirm no TU relies on the current placement.
namespace {

using PredIndPair = std::pair<xgboost::bst_float, unsigned>;
using PredIndPairContainer = std::vector<PredIndPair>;

}  // anonymous namespace

namespace xgboost {
// This creates a GPU metric instance dynamically and adds it to the GPU metric registry, if not
// present already. This is created when there is a device ordinal present and if xgboost
// is compiled with CUDA support
struct GPUMetric : Metric {
  // Look up 'name' in the GPU metric registry and build the metric; 'tparam' is
  // the generic configuration handed to the created instance.
  static Metric *CreateGPUMetric(const std::string& name, GenericParameter const* tparam);
};

/*!
 * \brief Internal registry entries for GPU Metric factory functions.
 *  The additional parameter const char* param gives the value after @, can be null.
 *  For example, metric map@3, then: param == "3".
 *  Entries are added via the XGBOOST_REGISTER_GPU_METRIC macro.
 */
struct MetricGPUReg
    : public dmlc::FunctionRegEntryBase<MetricGPUReg,
                                        std::function<Metric * (const char*)> > {
};

/*!
* \brief Macro to register metric computed on GPU.
*
* \code
* // example of registering a objective ndcg@k
* XGBOOST_REGISTER_GPU_METRIC(NDCG_GPU, "ndcg")
* .describe("NDCG metric computer on GPU.")
* .set_body([](const char* param) {
* int at_k = atoi(param);
* return new NDCG(at_k);
* });
* \endcode
*/

// Note: Metric names registered in the GPU registry should follow this convention:
// - GPU metric types should be registered with the same name as the non GPU metric types
#define XGBOOST_REGISTER_GPU_METRIC(UniqueId, Name) \
::xgboost::MetricGPUReg& __make_ ## MetricGPUReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::MetricGPUReg>::Get()->__REGISTER__(Name)

namespace metric {

// Ranking config to be used on device and host
struct EvalRankConfig {
  // Evaluate only the top 'topn' items of each group; defaults to "all items"
  // (no truncation) via the maximum representable value.
  unsigned topn{std::numeric_limits<unsigned>::max()};
  // Display name of the metric (e.g. "ndcg@3").
  std::string name;
  // Whether a larger metric value is worse (negated/minimized metric).
  bool minus{false};
};

class PackedReduceResult {
double residue_sum_;
double weights_sum_;
Expand Down
Loading