Improve initial setup time and memory consumption in fast histogram #2543

Closed · wants to merge 1 commit
55 changes: 30 additions & 25 deletions src/common/column_matrix.h
@@ -153,31 +153,36 @@ class ColumnMatrix {
std::vector<size_t> num_nonzeros;
num_nonzeros.resize(nfeature);
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
size_t fid = 0;
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
++fid;
}
if (type_[fid] == kDenseColumn) {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
});
} else {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
});
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];

const int nthread = omp_get_max_threads();
#pragma omp parallel num_threads(nthread)
{
for (size_t rid = 0; rid < nrow; ++rid) {
const bst_omp_uint ibegin = static_cast<bst_omp_uint>(gmat.row_ptr[rid]);
const bst_omp_uint iend = static_cast<bst_omp_uint>(gmat.row_ptr[rid + 1]);
#pragma omp for schedule(static)
for (bst_omp_uint i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
const auto& vec = gmat.cut->row_ptr;
auto it = std::upper_bound(vec.begin(), vec.end(), bin_id);
const size_t fid = it - vec.begin() - 1;
if (type_[fid] == kDenseColumn) {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
});
} else {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
});
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];
}
}
}
}
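Reviewer note: the new loop above replaces the old linear scan for the owning feature (incrementing fid until bin_id falls below the next cut boundary) with a binary search over gmat.cut->row_ptr. A minimal, self-contained sketch of that lookup, using hypothetical names rather than the PR's actual types:

#include <algorithm>
#include <cstdint>
#include <vector>

// Feature f owns bin ids in [cut_row_ptr[f], cut_row_ptr[f + 1]).
// upper_bound finds the first boundary strictly greater than bin_id,
// so the owning feature sits one slot earlier -- O(log nfeature) per element.
inline size_t FindFeature(const std::vector<uint32_t>& cut_row_ptr, uint32_t bin_id) {
  auto it = std::upper_bound(cut_row_ptr.begin(), cut_row_ptr.end(), bin_id);
  return static_cast<size_t>(it - cut_row_ptr.begin()) - 1;
}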
131 changes: 89 additions & 42 deletions src/common/hist_util.cc
@@ -12,11 +12,12 @@
#include "./column_matrix.h"
#include "./hist_util.h"
#include "./quantile.h"
#include "./memory.h"

namespace xgboost {
namespace common {

void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose) {
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
const MetaInfo& info = p_fmat->info();

@@ -33,6 +34,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
}

LOG(INFO) << "Generating sketches...";
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
@@ -55,51 +57,64 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
}
}

// gather the histogram data
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
std::vector<WXQSketch::SummaryContainer> summary_array;
summary_array.resize(sketchs.size());
for (size_t i = 0; i < sketchs.size(); ++i) {
WXQSketch::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i].Reserve(max_num_bins * kFactor);
summary_array[i].SetPrune(out, max_num_bins * kFactor);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());

this->min_val.resize(info.num_col);
row_ptr.push_back(0);
for (size_t fid = 0; fid < summary_array.size(); ++fid) {
WXQSketch::SummaryContainer a;
a.Reserve(max_num_bins);
a.SetPrune(summary_array[fid], max_num_bins);
const bst_float mval = a.data[0].value;
this->min_val[fid] = mval - fabs(mval);
if (a.size > 1 && a.size <= 16) {
/* specialized code categorial / ordinal data -- use midpoints */
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
if (i == 1 || cpt > cut.back()) {
cut.push_back(cpt);
// gather the histogram data
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
const size_t bundle_size // limit this task to 1GB
= std::min(GetSystemMemory() / 2,
static_cast<size_t>(1) * 1024 * 1024 * 1024)
/ (max_num_bins * kFactor * 16);
for (size_t ibegin = 0; ibegin < sketchs.size(); ibegin += bundle_size) {
const size_t iend = std::min(ibegin + bundle_size, sketchs.size());
const size_t batch_size = iend - ibegin;

std::vector<WXQSketch::SummaryContainer> summary_array;
summary_array.resize(batch_size);
if (verbose) {
LOG(INFO) << "Computing quantiles for features ["
<< ibegin << ", " << iend << ")...";
}
for (size_t i = ibegin; i < iend; ++i) {
WXQSketch::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i - ibegin].Reserve(max_num_bins * kFactor);
summary_array[i - ibegin].SetPrune(out, max_num_bins * kFactor);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());

for (size_t fid = ibegin; fid < iend; ++fid) {
WXQSketch::SummaryContainer a;
a.Reserve(max_num_bins);
a.SetPrune(summary_array[fid - ibegin], max_num_bins);
const bst_float mval = a.data[0].value;
this->min_val[fid] = mval - fabs(mval);
if (a.size > 1 && a.size <= 16) {
/* specialized code categorial / ordinal data -- use midpoints */
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
if (i == 1 || cpt > cut.back()) {
cut.push_back(cpt);
}
}
}
} else {
for (size_t i = 2; i < a.size; ++i) {
bst_float cpt = a.data[i - 1].value;
if (i == 2 || cpt > cut.back()) {
cut.push_back(cpt);
} else {
for (size_t i = 2; i < a.size; ++i) {
bst_float cpt = a.data[i - 1].value;
if (i == 2 || cpt > cut.back()) {
cut.push_back(cpt);
}
}
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt);
cut.push_back(last);
}
row_ptr.push_back(cut.size());
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt);
cut.push_back(last);
}
row_ptr.push_back(cut.size());
}
}
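Reviewer note: to make the memory cap above concrete, each batch ("bundle") of pruned summaries is limited to min(half of physical RAM, 1 GB), and each summary is assumed to cost roughly max_num_bins * kFactor * 16 bytes, per the comment in the hunk. A hedged sketch of that arithmetic; the helper name and the 16-bytes-per-entry figure are assumptions, not the PR's API:

#include <algorithm>
#include <cstddef>

// Hypothetical helper mirroring the bundle-size computation in the hunk above.
inline size_t ComputeBundleSize(size_t system_memory_bytes,
                                size_t max_num_bins, size_t k_factor) {
  const size_t one_gb = static_cast<size_t>(1) << 30;
  // Cap the working set for one batch of summaries.
  const size_t budget_bytes = std::min(system_memory_bytes / 2, one_gb);
  // Rough cost of one pruned summary (16 bytes per quantile entry, assumed).
  const size_t bytes_per_summary = max_num_bins * k_factor * 16;
  return std::max<size_t>(1, budget_bytes / bytes_per_summary);
}
// Example, assuming kFactor = 8: with 16 GB RAM and max_num_bins = 256,
// budget = 1 GB and bytes_per_summary = 32 KiB, so ~32768 features per batch.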

@@ -296,8 +311,15 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
return feature_nnz[a] > feature_nnz[b];
});

auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
std::vector<std::vector<unsigned>> groups_alt1, groups_alt2;

#pragma omp parallel sections
{
#pragma omp section
groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
#pragma omp section
groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
}
auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;

// take apart small, sparse groups, as it won't help speed
@@ -338,6 +360,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
cut = gmat.cut;

const size_t nrow = gmat.row_ptr.size() - 1;
const size_t nfeature = gmat.cut->row_ptr.size() - 1;
const uint32_t nbins = gmat.cut->row_ptr.back();

/* step 1: form feature groups */
@@ -355,10 +378,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}
}
}

std::vector<size_t> block_nnz(nblock, 0);
{
std::vector<size_t> feature_nnz(nfeature);
gmat.GetFeatureCounts(&feature_nnz[0]);
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
for (auto& fid : groups[group_id]) {
block_nnz[group_id] += feature_nnz[fid];
}
}
}

std::vector<std::vector<uint32_t>> index_temp(nblock);
std::vector<std::vector<size_t>> row_ptr_temp(nblock);
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
row_ptr_temp[block_id].reserve(nrow + 1);
row_ptr_temp[block_id].push_back(0);
index_temp[block_id].reserve(block_nnz[block_id]);
}
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
@@ -378,6 +415,16 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
std::vector<size_t> row_ptr_blk_ptr;
index_blk_ptr.push_back(0);
row_ptr_blk_ptr.push_back(0);

{
size_t tot = 0, tot2 = 0;
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
tot += index_temp[block_id].size();
tot2 += row_ptr_temp[block_id].size();
}
index.reserve(tot);
row_ptr.reserve(tot2);
}
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
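Reviewer note: both reserve blocks above apply the same idea: compute the final size first, reserve once, then append, so the destination vectors never reallocate mid-copy. A generic sketch of the pattern; the helper is hypothetical and not part of the PR:

#include <cstddef>
#include <vector>

// Concatenate per-block buffers into one flat vector with a single allocation.
template <typename T>
std::vector<T> ConcatBlocks(const std::vector<std::vector<T>>& blocks) {
  size_t total = 0;
  for (const auto& blk : blocks) total += blk.size();
  std::vector<T> out;
  out.reserve(total);  // avoids repeated growth while inserting below
  for (const auto& blk : blocks) {
    out.insert(out.end(), blk.begin(), blk.end());
  }
  return out;
}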
2 changes: 1 addition & 1 deletion src/common/hist_util.h
@@ -79,7 +79,7 @@ struct HistCutMatrix {
}
// create histogram cut matrix given statistics from data
// using approximate quantile sketch approach
void Init(DMatrix* p_fmat, uint32_t max_num_bins);
void Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose = false);
};


37 changes: 37 additions & 0 deletions src/common/memory.h
@@ -0,0 +1,37 @@
/*!
* Copyright 2017 by Contributors
* \file memory.h
* \brief Utility for memory
* \author Philip Cho
*/
#ifndef XGBOOST_COMMON_MEMORY_H_
#define XGBOOST_COMMON_MEMORY_H_

#ifndef _WIN32
#include <unistd.h>
#else
#define NOMINMAX
#include <windows.h>
#endif

namespace xgboost {
namespace common {

#ifndef _WIN32
inline size_t GetSystemMemory() {
size_t pages = sysconf(_SC_PHYS_PAGES);
size_t page_size = sysconf(_SC_PAGE_SIZE);
return pages * page_size;
}
#else
inline size_t GetSystemMemory() {
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
return status.ullTotalPhys;
}
#endif

} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_MEMORY_H_
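Reviewer note: a brief usage sketch for the new header; only GetSystemMemory comes from the file above, and the include path and surrounding code are assumptions:

#include <cstddef>
#include <iostream>
#include "memory.h"  // include path assumed; adjust to the build's layout

int main() {
  const size_t total_bytes = xgboost::common::GetSystemMemory();
  std::cout << "Physical memory: " << (total_bytes >> 20) << " MiB\n";
  // HistCutMatrix::Init above caps each summary batch at min(total_bytes / 2, 1 GiB).
  return 0;
}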
8 changes: 6 additions & 2 deletions src/tree/fast_hist_param.h
@@ -18,8 +18,10 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
// percentage threshold for treating a feature as sparse
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
double sparse_threshold;
// use feature grouping? (default yes)
// use feature grouping? (default no)
int enable_feature_grouping;
// use columnar access structure? (default yes)
int use_columnar_access;
// when grouping features, how many "conflicts" to allow.
// conflict is when an instance has nonzero values for two or more features
// default is 0, meaning features should be strictly complementary
@@ -45,7 +47,9 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
.describe("if >0, enable feature grouping to ameliorate work imbalance "
"among worker threads");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
DMLC_DECLARE_FIELD(use_columnar_access).set_lower_bound(0).set_default(1)
.describe("if >0, store a transposed copy of input matrix for fast columnar access");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
.describe("when grouping features, how many \"conflicts\" to allow."
"conflict is when an instance has nonzero values for two or more features."
"default is 0, meaning features should be strictly complementary.");