Optimized ApplySplit and UpdatePredictionCache functions
SmirnovEgorRu committed Jan 29, 2020
1 parent 02b7232 commit 4d18012
Showing 7 changed files with 551 additions and 246 deletions.
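The central change below is in GHistBuilder::BuildHist (src/common/hist_util.cc): the histogram loop is split into an explicitly software-prefetched main part and a short tail that runs without prefetching, so the prefetch reads never go past the end of the row-index array. A condensed, self-contained sketch of that pattern — illustrative names, not the library's actual code:

#include <xmmintrin.h>  // _mm_prefetch, _MM_HINT_T0
#include <cstddef>

// Sketch: accumulate values for the rows listed in rid[0..nrows).
// The main loop prefetches the value needed `offset` iterations ahead;
// the last `tail` rows (tail >= offset is assumed) run without
// prefetching so rid[i + offset] stays in bounds.
void AccumulateRows(const std::size_t* rid, std::size_t nrows,
                    const double* values, double* sum,
                    std::size_t offset, std::size_t tail) {
  for (std::size_t i = 0; i + tail < nrows; ++i) {
    _mm_prefetch(reinterpret_cast<const char*>(values + rid[i + offset]),
                 _MM_HINT_T0);
    *sum += values[rid[i]];
  }
  for (std::size_t i = (nrows < tail ? 0 : nrows - tail); i < nrows; ++i) {
    *sum += values[rid[i]];  // tail: plain loads, no prefetch
  }
}

The payoff is largest when rows are visited in scattered order; for a contiguous block of rows the commit skips software prefetching entirely and relies on the hardware prefetcher.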
1 change: 1 addition & 0 deletions src/common/column_matrix.h
@@ -37,6 +37,7 @@ class Column {
size_t Size() const { return len_; }
uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
+   const uint32_t* GetFeatureBinIdxPtr() const { return index_; }
// column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
// column.GetGlobalBinIdx(idx)
uint32_t GetBaseIdx() const { return index_base_; }
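The new GetFeatureBinIdxPtr() accessor exposes the column's bin-index array directly, so hot loops (row partitioning in ApplySplit is the likely consumer, given the commit title) can scan it without a per-element getter call. A hypothetical caller, assuming a dense column whose i-th entry belongs to row i:

// Hypothetical use of the new accessor: count rows whose feature value
// falls into a bin strictly below split_bin (dense column assumed).
size_t CountRowsLeftOfSplit(const Column& column, uint32_t split_bin) {
  const uint32_t* bin_idx = column.GetFeatureBinIdxPtr();
  size_t n_left = 0;
  for (size_t i = 0; i < column.Size(); ++i) {
    n_left += (bin_idx[i] < split_bin);  // branch-free, auto-vectorizable
  }
  return n_left;
}

Here split_bin is assumed to be a feature-local bin id (the same scale as GetFeatureBinIdx), not a global one.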
105 changes: 86 additions & 19 deletions src/common/hist_util.cc
@@ -662,8 +662,8 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
/*!
* \brief fill a histogram by zeroes
*/
- void InitilizeHistByZeroes(GHistRow hist) {
-   memset(hist.data(), '\0', hist.size()*sizeof(tree::GradStats));
+ void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
+   memset(hist.data() + begin, '\0', (end-begin)*sizeof(tree::GradStats));
}
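The [begin, end) range added here lets callers zero just a slice of a histogram; ParallelGHistBuilder below uses it both for whole histograms (0, hist.size()) and for per-range zero-filling inside ReduceHist. A minimal stand-alone illustration of range-based zeroing, assuming (as holds here) that GradStats is a trivially-copyable pair of doubles — names are illustrative:

#include <cstring>
#include <vector>

struct GradStats { double sum_grad; double sum_hess; };

// Mirrors the new InitilizeHistByZeroes(hist, begin, end) contract.
void ZeroHistRange(GradStats* hist, std::size_t begin, std::size_t end) {
  std::memset(hist + begin, 0, (end - begin) * sizeof(GradStats));
}

int main() {
  std::vector<GradStats> hist(1 << 14);
  const int nthreads = 4;
  const std::size_t chunk = hist.size() / nthreads;
#pragma omp parallel for
  for (int t = 0; t < nthreads; ++t) {
    // each thread clears only its own disjoint slice
    ZeroHistRange(hist.data(), t * chunk, (t + 1) * chunk);
  }
}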

/*!
@@ -707,40 +707,107 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
}
}

+ template<typename FPType, bool do_prefetch>
+ void BuildHistDenseKernel(const size_t* rid, const float* pgh, const uint32_t* index,
+                           FPType* hist_data, size_t ibegin, size_t iend, size_t n_features,
+                           size_t prefetch_offset, size_t prefetch_step) {
+   for (size_t i = ibegin; i < iend; ++i) {
+     const size_t icol_start = rid[i] * n_features;
+     const size_t idx_gh = 2*rid[i];
+
+     if (do_prefetch) {
+       const size_t icol_start_prefetch = rid[i+prefetch_offset] * n_features;
+
+       PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
+       for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
+            j += prefetch_step) {
+         PREFETCH_READ_T0(index + j);
+       }
+     }
+
+     for (size_t j = icol_start; j < icol_start + n_features; ++j) {
+       const uint32_t idx_bin = 2*index[j];
+
+       hist_data[idx_bin] += pgh[idx_gh];
+       hist_data[idx_bin+1] += pgh[idx_gh+1];
+     }
+   }
+ }
+
+ template<typename FPType, bool do_prefetch>
+ void BuildHistSparseKernel(const size_t* rid, const float* pgh, const uint32_t* index,
+                            FPType* hist_data, const size_t* row_ptr, size_t ibegin, size_t iend,
+                            size_t prefetch_offset, size_t prefetch_step) {
+   for (size_t i = ibegin; i < iend; ++i) {
+     const size_t icol_start = row_ptr[rid[i]];
+     const size_t icol_end = row_ptr[rid[i]+1];
+     const size_t idx_gh = 2*rid[i];
+
+     if (do_prefetch) {
+       const size_t icol_start_prefetch = row_ptr[rid[i+prefetch_offset]];
+       const size_t icol_end_prefetch = row_ptr[rid[i+prefetch_offset]+1];
+
+       PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
+       for (size_t j = icol_start_prefetch; j < icol_end_prefetch; j += prefetch_step) {
+         PREFETCH_READ_T0(index + j);
+       }
+     }
+
+     for (size_t j = icol_start; j < icol_end; ++j) {
+       const uint32_t idx_bin = 2*index[j];
+       hist_data[idx_bin] += pgh[idx_gh];
+       hist_data[idx_bin+1] += pgh[idx_gh+1];
+     }
+   }
+ }
+
+ template<typename FPType, bool do_prefetch>
+ void BuildHistKernel(const size_t* rid, const float* pgh, const uint32_t* index,
+                      FPType* hist_data, const size_t* row_ptr, size_t ibegin, size_t iend,
+                      size_t prefetch_offset, size_t prefetch_step, bool isDense) {
+   if (isDense) {
+     const size_t n_features = row_ptr[rid[0]+1] - row_ptr[rid[0]];
+     BuildHistDenseKernel<FPType, do_prefetch>(rid, pgh, index, hist_data,
+       ibegin, iend, n_features, prefetch_offset, prefetch_step);
+   } else {
+     BuildHistSparseKernel<FPType, do_prefetch>(rid, pgh, index, hist_data, row_ptr,
+       ibegin, iend, prefetch_offset, prefetch_step);
+   }
+ }

void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
-                              GHistRow hist) {
+                              GHistRow hist,
+                              bool isDense) {
const size_t* rid = row_indices.begin;
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
const float* pgh = reinterpret_cast<const float*>(gpair.data());

-   double* hist_data = reinterpret_cast<double*>(hist.data());
+   using FPType = decltype(tree::GradStats::sum_grad);
+   FPType* hist_data = reinterpret_cast<FPType*>(hist.data());

const size_t cache_line_size = 64;
const size_t prefetch_offset = 10;
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
+   const size_t prefetch_step = cache_line_size / sizeof(*index);

-   for (size_t i = 0; i < nrows; ++i) {
-     const size_t icol_start = row_ptr[rid[i]];
-     const size_t icol_end = row_ptr[rid[i]+1];
-
-     if (i < nrows - no_prefetch_size) {
-       PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
-       PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
-     }
-
-     for (size_t j = icol_start; j < icol_end; ++j) {
-       const uint32_t idx_bin = 2*index[j];
-       const size_t idx_gh = 2*rid[i];
-
-       hist_data[idx_bin] += pgh[idx_gh];
-       hist_data[idx_bin+1] += pgh[idx_gh+1];
-     }
-   }
+   // true if the row set is a contiguous block of rows
+   // (e.g. the root node, which works with all rows of the bin-matrix)
+   const bool contiguousBlock = (rid[row_indices.Size()-1] - rid[0]) == (row_indices.Size() - 1);
+
+   if (contiguousBlock) {
+     // contiguous memory access; the built-in HW prefetcher is enough
+     BuildHistKernel<FPType, false>(rid, pgh, index, hist_data, row_ptr,
+       0, nrows, prefetch_offset, prefetch_step, isDense);
+   } else {
+     BuildHistKernel<FPType, true>(rid, pgh, index, hist_data, row_ptr,
+       0, nrows - no_prefetch_size, prefetch_offset, prefetch_step, isDense);
+     // no prefetching for the tail, to avoid reading beyond the row set
+     BuildHistKernel<FPType, false>(rid, pgh, index, hist_data, row_ptr,
+       nrows - no_prefetch_size, nrows, prefetch_offset, prefetch_step, isDense);
+   }
}

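PREFETCH_READ_T0, used throughout the kernels above, is not defined in this diff. A plausible definition for reference — an assumption about the macro, not the commit's actual code — wrapping the SSE prefetch intrinsic:

#include <xmmintrin.h>  // _mm_prefetch, _MM_HINT_T0

// Assumed definition: ask the CPU to pull the cache line holding `addr`
// into all cache levels (T0 hint) ahead of the actual load.
#define PREFETCH_READ_T0(addr) \
  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)

With cache_line_size = 64 and 4-byte uint32_t bins, prefetch_step works out to 16, i.e. one prefetch per cache line of the index array rather than one per element.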
28 changes: 19 additions & 9 deletions src/common/hist_util.h
@@ -355,7 +355,7 @@ using GHistRow = Span<tree::GradStats>;
/*!
* \brief fill a histogram by zeros
*/
- void InitilizeHistByZeroes(GHistRow hist);
+ void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);

/*!
* \brief Increment hist as dst += add in range [begin, end)
@@ -454,6 +454,7 @@ class ParallelGHistBuilder {
tid_nid_to_hist_.clear();
hist_memory_.clear();
threads_to_nids_map_.clear();

targeted_hists_ = targeted_hists;

CHECK_EQ(nodes, targeted_hists.size());
@@ -478,7 +479,7 @@
GHistRow hist = hist_memory_[idx];

if (!hist_was_used_[tid * nodes_ + nid]) {
-       InitilizeHistByZeroes(hist);
+       InitilizeHistByZeroes(hist, 0, hist.size());
hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
}

@@ -492,16 +493,23 @@

GHistRow dst = targeted_hists_[nid];

+     bool is_updated = false;
for (size_t tid = 0; tid < nthreads_; ++tid) {
if (hist_was_used_[tid * nodes_ + nid]) {
+         is_updated = true;
const size_t idx = tid_nid_to_hist_.at({tid, nid});
GHistRow src = hist_memory_[idx];

if (dst.data() != src.data()) {
IncrementHist(dst, src, begin, end);
} // else src is already targeted hist
}
}
}
+     if (!is_updated) {
+       // In distributed mode some tree nodes can be empty on a local machine,
+       // so just fill the local histogram with zeros in that case
+       InitilizeHistByZeroes(dst, begin, end);
+     }
}

protected:
@@ -531,18 +539,19 @@
size_t hist_allocated_additionally = 0;

for (size_t nid = 0; nid < nodes_; ++nid) {
-       size_t nthreads_for_nid = 0;
+       int nthreads_for_nid = 0;

for (size_t tid = 0; tid < nthreads_; ++tid) {
if (threads_to_nids_map_[tid * nodes_ + nid]) {
nthreads_for_nid++;
}
}

-       CHECK_GT(nthreads_for_nid, 0);
-       // -1 means that we have one histogram per node already allocated externally,
-       // which should store final result for the node
-       hist_allocated_additionally += (nthreads_for_nid - 1);
+       // In distributed mode some tree nodes can be empty on a local machine;
+       // in that case nthreads_for_nid is 0 and nothing extra is allocated.
+       // Otherwise, allocate (nthreads_for_nid - 1) additional histograms:
+       // one histogram per node is already allocated externally and will
+       // store the final result for the node.
+       hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
}

for (size_t i = 0; i < hist_allocated_additionally; ++i) {
@@ -613,7 +622,8 @@ class GHistBuilder {
void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
-                  GHistRow hist);
+                  GHistRow hist,
+                  bool isDense);
// same, with feature grouping
void BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
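Taken together, the ParallelGHistBuilder changes make the reduce step robust to empty nodes: a node that no thread touched (possible on a local machine in distributed mode) no longer trips a CHECK, gets no extra per-thread histograms allocated, and has its target range zero-filled explicitly. A compact stand-alone model of that reduce logic with simplified types — illustrative, not the library's API:

#include <algorithm>
#include <cstddef>
#include <vector>

struct GradStats { double sum_grad = 0.0; double sum_hess = 0.0; };
using Hist = std::vector<GradStats>;

// Model of ReduceHist: sum the copies of the threads that actually built
// part of this node's histogram; if none did, zero-fill the target range.
void Reduce(Hist* dst, const std::vector<Hist>& per_thread,
            const std::vector<bool>& was_used,
            std::size_t begin, std::size_t end) {
  bool updated = false;
  for (std::size_t tid = 0; tid < per_thread.size(); ++tid) {
    if (!was_used[tid]) continue;
    updated = true;
    for (std::size_t i = begin; i < end; ++i) {
      (*dst)[i].sum_grad += per_thread[tid][i].sum_grad;
      (*dst)[i].sum_hess += per_thread[tid][i].sum_hess;
    }
  }
  if (!updated) {
    std::fill(dst->begin() + begin, dst->begin() + end, GradStats{});
  }
}

The real code additionally skips the copy whose buffer is the target histogram itself (dst.data() == src.data()), since its contribution is already in place.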
(Diffs for the remaining 4 changed files are not rendered here.)
