Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support building gradient index with cat data. #7371

Merged
merged 5 commits into from
Nov 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions src/common/hist_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@
#include <utility>
#include <map>

#include "row_set.h"
#include "categorical.h"
#include "common.h"
#include "quantile.h"
#include "row_set.h"
#include "threading_utils.h"
#include "../tree/param.h"
#include "./quantile.h"
#include "./timer.h"
#include "../include/rabit/rabit.h"
#include "timer.h"

namespace xgboost {
class GHistIndexMatrix;
Expand Down Expand Up @@ -105,9 +104,29 @@ class HistogramCuts {
return idx;
}

/**
 * \brief Search the bin index for a numerical feature entry.
 *
 * Convenience overload that forwards the entry's value and feature
 * index to SearchBin(float, feature index).
 */
BinIdx SearchBin(Entry const &e) const { return this->SearchBin(e.fvalue, e.index); }

/**
 * \brief Search the bin index for a categorical feature entry.
 *
 * For categorical features the cut values hold the category codes
 * themselves, so the bin is located by a binary search over the slice
 * of cut values that belongs to this feature.
 */
BinIdx SearchCatBin(Entry const &e) const {
  auto const &feat_ptrs = this->Ptrs();
  auto const &cut_vals = this->Values();
  // `.at` keeps the original bounds check on the feature index.
  auto const feat_end = feat_ptrs.at(e.index + 1);
  auto const first = cut_vals.cbegin() + feat_ptrs[e.index];
  auto const last = cut_vals.cbegin() + feat_end;
  // Truncates the value in case it's not perfectly rounded.
  auto const cat = static_cast<float>(common::AsCat(e.fvalue));
  auto bin_idx = std::lower_bound(first, last, cat) - cut_vals.cbegin();
  // lower_bound lands one past the slice when the category exceeds every
  // cut value; clamp back into this feature's bin range.
  if (bin_idx == feat_end) {
    bin_idx -= 1;
  }
  return bin_idx;
}
};

inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
Expand Down
6 changes: 4 additions & 2 deletions src/common/quantile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
*/
#include <limits>
#include <utility>

#include "rabit/rabit.h"
#include "quantile.h"
#include "hist_util.h"
#include "categorical.h"
Expand Down Expand Up @@ -189,7 +191,7 @@ void HostSketchContainer::PushRowPage(
if (is_dense) {
for (size_t ii = begin; ii < end; ii++) {
if (IsCat(feature_types_, ii)) {
categories_[ii].emplace(p_inst[ii].fvalue);
categories_[ii].emplace(AsCat(p_inst[ii].fvalue));
} else {
sketches_[ii].Push(p_inst[ii].fvalue, w);
}
Expand All @@ -199,7 +201,7 @@ void HostSketchContainer::PushRowPage(
auto const& entry = p_inst[i];
if (entry.index >= begin && entry.index < end) {
if (IsCat(feature_types_, entry.index)) {
categories_[entry.index].emplace(entry.fvalue);
categories_[entry.index].emplace(AsCat(entry.fvalue));
} else {
sketches_[entry.index].Push(entry.fvalue, w);
}
Expand Down
19 changes: 11 additions & 8 deletions src/data/gradient_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

namespace xgboost {

void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
size_t prev_sum, uint32_t nbins,
void GHistIndexMatrix::PushBatch(SparsePage const &batch,
common::Span<FeatureType const> ft,
size_t rbegin, size_t prev_sum, uint32_t nbins,
int32_t n_threads) {
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
Expand Down Expand Up @@ -86,23 +87,23 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
common::BinTypeSize curent_bin_size = index.GetBinTypeSize();
if (curent_bin_size == common::kUint8BinsTypeSize) {
common::Span<uint8_t> index_data_span = {index.data<uint8_t>(), n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[offsets](auto idx, auto j) {
return static_cast<uint8_t>(idx - offsets[j]);
});

} else if (curent_bin_size == common::kUint16BinsTypeSize) {
common::Span<uint16_t> index_data_span = {index.data<uint16_t>(),
n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[offsets](auto idx, auto j) {
return static_cast<uint16_t>(idx - offsets[j]);
});
} else {
CHECK_EQ(curent_bin_size, common::kUint32BinsTypeSize);
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(),
n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[offsets](auto idx, auto j) {
return static_cast<uint32_t>(idx - offsets[j]);
});
Expand All @@ -113,7 +114,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, size_t rbegin,
not reduced */
} else {
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
SetIndexData(index_data_span, batch_threads, batch, rbegin, nbins,
SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins,
[](auto idx, auto) { return idx; });
}

Expand Down Expand Up @@ -147,15 +148,17 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins, common::Span<float> h
size_t prev_sum = 0;
const bool isDense = p_fmat->IsDense();
this->isDense_ = isDense;
auto ft = p_fmat->Info().feature_types.ConstHostSpan();

for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
this->PushBatch(batch, rbegin, prev_sum, nbins, nthread);
this->PushBatch(batch, ft, rbegin, prev_sum, nbins, nthread);
prev_sum = row_ptr[rbegin + batch.Size()];
rbegin += batch.Size();
}
}

void GHistIndexMatrix::Init(SparsePage const &batch,
common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts,
int32_t max_bins_per_feat, bool isDense,
int32_t n_threads) {
Expand All @@ -176,7 +179,7 @@ void GHistIndexMatrix::Init(SparsePage const &batch,
size_t rbegin = 0;
size_t prev_sum = 0;

this->PushBatch(batch, rbegin, prev_sum, nbins, n_threads);
this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads);
}

void GHistIndexMatrix::ResizeIndex(const size_t n_index,
Expand Down
25 changes: 18 additions & 7 deletions src/data/gradient_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <vector>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "../common/categorical.h"
#include "../common/hist_util.h"
#include "../common/threading_utils.h"

Expand All @@ -18,8 +19,9 @@ namespace xgboost {
* index for CPU histogram. On GPU ellpack page is used.
*/
class GHistIndexMatrix {
void PushBatch(SparsePage const &batch, size_t rbegin, size_t prev_sum,
uint32_t nbins, int32_t n_threads);
void PushBatch(SparsePage const &batch, common::Span<FeatureType const> ft,
size_t rbegin, size_t prev_sum, uint32_t nbins,
int32_t n_threads);

public:
/*! \brief row pointer to rows by element position */
Expand All @@ -40,12 +42,14 @@ class GHistIndexMatrix {
}
// Create a global histogram matrix, given cut
void Init(DMatrix* p_fmat, int max_num_bins, common::Span<float> hess);
void Init(SparsePage const &page, common::HistogramCuts const &cuts,
int32_t max_bins_per_feat, bool is_dense, int32_t n_threads);
void Init(SparsePage const &page, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
bool is_dense, int32_t n_threads);

// specific method for sparse data as no possibility to reduce allocated memory
template <typename BinIdxType, typename GetOffset>
void SetIndexData(common::Span<BinIdxType> index_data_span,
common::Span<FeatureType const> ft,
size_t batch_threads, const SparsePage &batch,
size_t rbegin, size_t nbins, GetOffset get_offset) {
const xgboost::Entry *data_ptr = batch.data.HostVector().data();
Expand All @@ -61,9 +65,16 @@ class GHistIndexMatrix {
SparsePage::Inst inst = {data_ptr + offset_vec[i], size};
CHECK_EQ(ibegin + inst.size(), iend);
for (bst_uint j = 0; j < inst.size(); ++j) {
uint32_t idx = cut.SearchBin(inst[j]);
index_data[ibegin + j] = get_offset(idx, j);
++hit_count_tloc_[tid * nbins + idx];
auto e = inst[j];
if (common::IsCat(ft, e.index)) {
auto bin_idx = cut.SearchCatBin(e);
index_data[ibegin + j] = get_offset(bin_idx, j);
++hit_count_tloc_[tid * nbins + bin_idx];
} else {
uint32_t idx = cut.SearchBin(inst[j]);
index_data[ibegin + j] = get_offset(idx, j);
++hit_count_tloc_[tid * nbins + idx];
}
}
});
}
Expand Down
3 changes: 2 additions & 1 deletion src/data/gradient_index_page_source.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ void GradientIndexPageSource::Fetch() {
auto const& csr = source_->Page();
this->page_.reset(new GHistIndexMatrix());
CHECK_NE(cuts_.Values().size(), 0);
this->page_->Init(*csr, cuts_, max_bin_per_feat_, is_dense_, nthreads_);
this->page_->Init(*csr, feature_types_, cuts_, max_bin_per_feat_, is_dense_,
nthreads_);
this->WriteCache();
}
}
Expand Down
6 changes: 4 additions & 2 deletions src/data/gradient_index_page_source.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@ class GradientIndexPageSource : public PageSourceIncMixIn<GHistIndexMatrix> {
common::HistogramCuts cuts_;
bool is_dense_;
int32_t max_bin_per_feat_;
common::Span<FeatureType const> feature_types_;

public:
GradientIndexPageSource(float missing, int nthreads, bst_feature_t n_features,
size_t n_batches, std::shared_ptr<Cache> cache,
BatchParam param, common::HistogramCuts cuts,
bool is_dense, int32_t max_bin_per_feat,
common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache),
cuts_{std::move(cuts)}, is_dense_{is_dense}, max_bin_per_feat_{
max_bin_per_feat} {
cuts_{std::move(cuts)}, is_dense_{is_dense},
max_bin_per_feat_{max_bin_per_feat}, feature_types_{feature_types} {
this->source_ = source;
this->Fetch();
}
Expand Down
3 changes: 2 additions & 1 deletion src/data/sparse_page_dmatrix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,11 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam&
batch_param_ = param;
ghist_index_source_.reset();
CHECK_NE(cuts.Values().size(), 0);
auto ft = this->info_.feature_types.ConstHostSpan();
ghist_index_source_.reset(new GradientIndexPageSource(
this->missing_, this->ctx_.Threads(), this->Info().num_col_,
this->n_batches_, cache_info_.at(id), param, std::move(cuts),
this->IsDense(), param.max_bin, sparse_page_source_));
this->IsDense(), param.max_bin, ft, sparse_page_source_));
} else {
CHECK(ghist_index_source_);
ghist_index_source_->Reset();
Expand Down
3 changes: 2 additions & 1 deletion src/tree/gpu_hist/gradient_based_sampler.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright 2019 by XGBoost Contributors
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <thrust/functional.h>
#include <thrust/random.h>
Expand All @@ -13,6 +13,7 @@

#include "../../common/compressed_iterator.h"
#include "../../common/random.h"
#include "../param.h"
#include "gradient_based_sampler.cuh"

namespace xgboost {
Expand Down
34 changes: 34 additions & 0 deletions tests/cpp/data/test_gradient_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,39 @@ TEST(GradientIndex, ExternalMemory) {
++i;
}
}

// Verifies that building a GHistIndexMatrix from a single categorical
// column produces one cut value per category (ignoring max_bins) and
// that every row's bin maps back to its own category code.
TEST(GradientIndex, FromCategoricalBasic) {
  size_t constexpr kRows = 1000, kCats = 13, kCols = 1;
  size_t max_bins = 8;
  auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
  // Use kCols instead of a hard-coded 1 so the shape stays consistent
  // with the constants above.
  auto m = GetDMatrixFromData(x, kRows, kCols);

  // Mark every column as categorical so the index is built from
  // category codes rather than numerical quantile sketches.
  auto &h_ft = m->Info().feature_types.HostVector();
  h_ft.resize(kCols, FeatureType::kCategorical);

  GHistIndexMatrix gidx;
  gidx.Init(m.get(), max_bins, {});

  // Count the distinct categories that were actually generated.
  auto x_copy = x;
  std::sort(x_copy.begin(), x_copy.end());
  auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin();
  ASSERT_EQ(n_uniques, kCats);

  auto const &h_cut_ptr = gidx.cut.Ptrs();
  auto const &h_cut_values = gidx.cut.Values();

  // One feature => two entries in the CSC-style pointer array; every
  // category gets its own cut value even though kCats > max_bins.
  ASSERT_EQ(h_cut_ptr.size(), 2);
  ASSERT_EQ(h_cut_values.size(), kCats);

  // Each stored bin index must round-trip back to the row's category.
  auto const &index = gidx.index;
  for (size_t i = 0; i < x.size(); ++i) {
    auto bin = index[i];
    auto bin_value = h_cut_values.at(bin);
    ASSERT_EQ(common::AsCat(x[i]), common::AsCat(bin_value));
  }
}
} // namespace data
} // namespace xgboost
4 changes: 4 additions & 0 deletions tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
/*!
* Copyright 2020-2021 by XGBoost Contributors
*/
#include <gtest/gtest.h>

#include "../../../../src/data/ellpack_page.cuh"
#include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
#include "../../../../src/tree/param.h"
#include "../../helpers.h"
#include "dmlc/filesystem.h"

Expand Down