Improve initial setup time and memory consumption in fast histogram
hcho3 committed Jul 24, 2017
1 parent d41dc07 commit 4d614f2
Showing 8 changed files with 237 additions and 92 deletions.
17 changes: 17 additions & 0 deletions doc/parameter.md
@@ -107,6 +107,23 @@ Parameters for Tree Booster
- This is only used if 'hist' is specified as `tree_method`.
- Maximum number of discrete bins to bucket continuous features.
- Increasing this number improves the optimality of splits at the cost of higher computation time.
* use_columnar_access, [default=1]
- This is only used if 'hist' is specified as `tree_method`.
- If greater than zero, store a transposed copy of the input matrix for fast columnar access. May increase memory usage and initial setup time.
* sparse_threshold, [default=0.2]
- range: [0.0, 1.0]
- This is only used if 'hist' is specified as `tree_method`.
- Percentage threshold for treating a feature as sparse. For instance, 0.2 indicates that any feature with nonzero entries in fewer than 20% of rows will be considered sparse. May impact computation time slightly.
* enable_feature_grouping, [default=0]
- This is only used if 'hist' is specified as `tree_method`.
- If greater than zero, group complementary features together so as to improve work balance for parallel histogram aggregation. May increase memory usage and initial setup time.
* max_conflict_rate, [default=0]
- range: [0.0, 1.0]
- Only relevant when `enable_feature_grouping=1` is specified.
- Specifies the criterion for "complementary" features. By default, only features with no common nonzero rows are considered complementary. Increase this number to encourage larger feature groups.
* max_search_group, [default=100]
- Only relevant when `enable_feature_grouping=1` is specified.
- Increasing this number will result in better feature grouping, at the cost of greater initial setup time.
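
A minimal sketch of enabling the `hist` method together with the parameters above through the XGBoost C API; the training file name and the specific parameter values are illustrative only, not recommendations:

```cpp
// Illustrative only: enable the 'hist' updater and the parameters documented
// above via the C API. "train.libsvm" is a placeholder path.
#include <xgboost/c_api.h>

int main() {
  DMatrixHandle dtrain;
  XGDMatrixCreateFromFile("train.libsvm", 0, &dtrain);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);

  XGBoosterSetParam(booster, "tree_method", "hist");
  XGBoosterSetParam(booster, "max_bin", "256");
  XGBoosterSetParam(booster, "use_columnar_access", "1");
  XGBoosterSetParam(booster, "sparse_threshold", "0.2");
  XGBoosterSetParam(booster, "enable_feature_grouping", "1");
  XGBoosterSetParam(booster, "max_conflict_rate", "0.1");
  XGBoosterSetParam(booster, "max_search_group", "100");

  for (int iter = 0; iter < 10; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}
```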

Additional parameters for Dart Booster
--------------------------------------
55 changes: 30 additions & 25 deletions src/common/column_matrix.h
@@ -153,31 +153,36 @@ class ColumnMatrix {
std::vector<size_t> num_nonzeros;
num_nonzeros.resize(nfeature);
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
size_t fid = 0;
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
++fid;
}
if (type_[fid] == kDenseColumn) {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
});
} else {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
});
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];

const int nthread = omp_get_max_threads();
#pragma omp parallel num_threads(nthread)
{
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
#pragma omp for schedule(static)
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
const auto& vec = gmat.cut->row_ptr;
auto it = std::upper_bound(vec.begin(), vec.end(), bin_id);
const size_t fid = it - vec.begin() - 1;
if (type_[fid] == kDenseColumn) {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
});
} else {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
});
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];
}
}
}
}
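
The rewritten loop above replaces the old linear scan for the feature owning each bin with a binary search over `gmat.cut->row_ptr`, and parallelizes the per-row inner loop with OpenMP. A standalone sketch of just the bin-to-feature lookup (the boundary values are made up for illustration):

```cpp
// Sketch of the std::upper_bound lookup used above: cut_row_ptr[f] is the
// first bin id owned by feature f, so the owner of bin_id is the last
// boundary not greater than bin_id.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Feature 0 owns bins [0,3), feature 1 owns [3,7), feature 2 owns [7,12).
  const std::vector<uint32_t> cut_row_ptr = {0, 3, 7, 12};
  const uint32_t bin_id = 5;
  auto it = std::upper_bound(cut_row_ptr.begin(), cut_row_ptr.end(), bin_id);
  const std::size_t fid = static_cast<std::size_t>(it - cut_row_ptr.begin()) - 1;
  std::cout << "bin " << bin_id << " -> feature " << fid << std::endl;  // feature 1
  return 0;
}
```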
131 changes: 89 additions & 42 deletions src/common/hist_util.cc
@@ -12,11 +12,12 @@
#include "./column_matrix.h"
#include "./hist_util.h"
#include "./quantile.h"
#include "./memory.h"

namespace xgboost {
namespace common {

void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose) {
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
const MetaInfo& info = p_fmat->info();

@@ -33,6 +34,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
}

LOG(INFO) << "Generating sketches...";
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
@@ -55,51 +57,64 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
}
}

// gather the histogram data
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
std::vector<WXQSketch::SummaryContainer> summary_array;
summary_array.resize(sketchs.size());
for (size_t i = 0; i < sketchs.size(); ++i) {
WXQSketch::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i].Reserve(max_num_bins * kFactor);
summary_array[i].SetPrune(out, max_num_bins * kFactor);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());

this->min_val.resize(info.num_col);
row_ptr.push_back(0);
for (size_t fid = 0; fid < summary_array.size(); ++fid) {
WXQSketch::SummaryContainer a;
a.Reserve(max_num_bins);
a.SetPrune(summary_array[fid], max_num_bins);
const bst_float mval = a.data[0].value;
this->min_val[fid] = mval - fabs(mval);
if (a.size > 1 && a.size <= 16) {
/* specialized code categorial / ordinal data -- use midpoints */
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
if (i == 1 || cpt > cut.back()) {
cut.push_back(cpt);
// gather the histogram data
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
const size_t bundle_size // limit this task to 1GB
= std::min(GetSystemMemory() / 2,
static_cast<unsigned long long>(1) * 1024 * 1024 * 1024)
/ (max_num_bins * kFactor * 16);
for (size_t ibegin = 0; ibegin < sketchs.size(); ibegin += bundle_size) {
const size_t iend = std::min(ibegin + bundle_size, sketchs.size());
const size_t batch_size = iend - ibegin;

std::vector<WXQSketch::SummaryContainer> summary_array;
summary_array.resize(batch_size);
if (verbose) {
LOG(INFO) << "Computing quantiles for features ["
<< ibegin << ", " << iend << ")...";
}
for (size_t i = ibegin; i < iend; ++i) {
WXQSketch::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i - ibegin].Reserve(max_num_bins * kFactor);
summary_array[i - ibegin].SetPrune(out, max_num_bins * kFactor);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());

for (size_t fid = ibegin; fid < iend; ++fid) {
WXQSketch::SummaryContainer a;
a.Reserve(max_num_bins);
a.SetPrune(summary_array[fid - ibegin], max_num_bins);
const bst_float mval = a.data[0].value;
this->min_val[fid] = mval - fabs(mval);
if (a.size > 1 && a.size <= 16) {
/* specialized code categorial / ordinal data -- use midpoints */
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
if (i == 1 || cpt > cut.back()) {
cut.push_back(cpt);
}
}
}
} else {
for (size_t i = 2; i < a.size; ++i) {
bst_float cpt = a.data[i - 1].value;
if (i == 2 || cpt > cut.back()) {
cut.push_back(cpt);
} else {
for (size_t i = 2; i < a.size; ++i) {
bst_float cpt = a.data[i - 1].value;
if (i == 2 || cpt > cut.back()) {
cut.push_back(cpt);
}
}
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt);
cut.push_back(last);
}
row_ptr.push_back(cut.size());
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt);
cut.push_back(last);
}
row_ptr.push_back(cut.size());
}
}
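
The bundling loop above caps the memory spent on per-feature quantile summaries at min(half of system memory, 1 GB) and processes the sketches in batches of `bundle_size` features. As a rough check of the resulting batch size, assuming `kFactor == 8` and roughly 16 bytes per summary entry (both are assumptions for illustration, not values taken from this diff):

```cpp
// Back-of-the-envelope reproduction of the bundle_size arithmetic above.
// kFactor = 8 and 16 bytes per entry are assumptions for illustration.
#include <algorithm>
#include <iostream>

int main() {
  const unsigned long long system_memory = 8ULL << 30;            // pretend 8 GB machine
  const unsigned long long budget =
      std::min(system_memory / 2, 1ULL << 30);                    // capped at 1 GB
  const unsigned long long max_num_bins = 256, kFactor = 8;
  const unsigned long long bundle_size = budget / (max_num_bins * kFactor * 16);
  std::cout << bundle_size << " features per bundle" << std::endl;  // prints 32768
  return 0;
}
```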

@@ -296,8 +311,15 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
return feature_nnz[a] > feature_nnz[b];
});

auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
std::vector<std::vector<unsigned>> groups_alt1, groups_alt2;

#pragma omp parallel sections
{
#pragma omp section
groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
#pragma omp section
groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
}
auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;

// take apart small, sparse groups, as it won't help speed
Expand Down Expand Up @@ -338,6 +360,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
cut = gmat.cut;

const size_t nrow = gmat.row_ptr.size() - 1;
const size_t nfeature = gmat.cut->row_ptr.size() - 1;
const uint32_t nbins = gmat.cut->row_ptr.back();

/* step 1: form feature groups */
@@ -355,10 +378,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}
}
}

std::vector<size_t> block_nnz(nblock, 0);
{
std::vector<size_t> feature_nnz(nfeature);
gmat.GetFeatureCounts(&feature_nnz[0]);
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
for (auto& fid : groups[group_id]) {
block_nnz[group_id] += feature_nnz[fid];
}
}
}

std::vector<std::vector<uint32_t>> index_temp(nblock);
std::vector<std::vector<size_t>> row_ptr_temp(nblock);
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
row_ptr_temp[block_id].reserve(nrow + 1);
row_ptr_temp[block_id].push_back(0);
index_temp[block_id].reserve(block_nnz[block_id]);
}
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
@@ -378,6 +415,16 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
std::vector<size_t> row_ptr_blk_ptr;
index_blk_ptr.push_back(0);
row_ptr_blk_ptr.push_back(0);

{
size_t tot = 0, tot2 = 0;
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
tot += index_temp[block_id].size();
tot2 += row_ptr_temp[block_id].size();
}
index.reserve(tot);
row_ptr.reserve(tot2);
}
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
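
Two changes are visible in this part of the diff: the two candidate feature groupings are computed concurrently with OpenMP sections (keeping whichever produces fewer groups), and the per-block buffers are reserved up front from the per-feature nonzero counts so the copy loops do not reallocate. A minimal sketch of the sections pattern, with stand-in functions instead of `FindGroups`:

```cpp
// Minimal sketch of '#pragma omp parallel sections' as used in
// FastFeatureGrouping: two independent passes run concurrently and the
// smaller result wins. PassA/PassB are stand-ins, not library functions.
#include <vector>

static std::vector<int> PassA() { return std::vector<int>(3, 0); }
static std::vector<int> PassB() { return std::vector<int>(5, 0); }

int main() {
  std::vector<int> alt1, alt2;
#pragma omp parallel sections
  {
#pragma omp section
    alt1 = PassA();
#pragma omp section
    alt2 = PassB();
  }
  const std::vector<int>& groups = (alt1.size() > alt2.size()) ? alt2 : alt1;
  return static_cast<int>(groups.size());  // 3: the grouping with fewer groups
}
```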
2 changes: 1 addition & 1 deletion src/common/hist_util.h
@@ -79,7 +79,7 @@ struct HistCutMatrix {
}
// create histogram cut matrix given statistics from data
// using approximate quantile sketch approach
void Init(DMatrix* p_fmat, uint32_t max_num_bins);
void Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose = false);
};


38 changes: 38 additions & 0 deletions src/common/memory.h
@@ -0,0 +1,38 @@
/*!
* Copyright 2017 by Contributors
* \file memory.h
* \brief Utility for memory
* \author Philip Cho
*/
#ifndef XGBOOST_COMMON_MEMORY_H_
#define XGBOOST_COMMON_MEMORY_H_

#ifndef _WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif

namespace xgboost {
namespace common {

#ifndef _WIN32
inline unsigned long long GetSystemMemory()
{
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
return pages * page_size;
}
#else
inline unsigned long long GetSystemMemory()
{
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
return status.ullTotalPhys;
}
#endif

} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_MEMORY_H_
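
A short usage sketch for the new helper; the include path is an assumption about the repository layout:

```cpp
// Usage sketch: GetSystemMemory() reports total physical memory in bytes.
// The include below assumes src/common/ is on the compiler's include path.
#include <iostream>
#include "common/memory.h"

int main() {
  const unsigned long long bytes = xgboost::common::GetSystemMemory();
  std::cout << "physical memory: " << (bytes >> 20) << " MiB" << std::endl;
  return 0;
}
```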
8 changes: 6 additions & 2 deletions src/tree/fast_hist_param.h
@@ -18,8 +18,10 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
// percentage threshold for treating a feature as sparse
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
double sparse_threshold;
// use feature grouping? (default yes)
// use feature grouping? (default no)
int enable_feature_grouping;
// use columnar access structure? (default yes)
int use_columnar_access;
// when grouping features, how many "conflicts" to allow.
// conflict is when an instance has nonzero values for two or more features
// default is 0, meaning features should be strictly complementary
@@ -45,7 +47,9 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
.describe("if >0, enable feature grouping to ameliorate work imbalance "
"among worker threads");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
DMLC_DECLARE_FIELD(use_columnar_access).set_lower_bound(0).set_default(1)
.describe("if >0, store a transposed copy of input matrix for fast columnar access");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
.describe("when grouping features, how many \"conflicts\" to allow."
"conflict is when an instance has nonzero values for two or more features."
"default is 0, meaning features should be strictly complementary.");