Improve initial setup time and memory consumption in fast histogram
hcho3 committed Jul 24, 2017
1 parent d41dc07 commit 4d614f2
Showing 8 changed files with 237 additions and 92 deletions.
17 changes: 17 additions & 0 deletions doc/parameter.md
@@ -107,6 +107,23 @@ Parameters for Tree Booster
- This is only used if 'hist' is specified as `tree_method`.
- Maximum number of discrete bins to bucket continuous features.
- Increasing this number improves the optimality of splits at the cost of higher computation time.
* use_columnar_access, [default=1]
- This is only used if 'hist' is specified as `tree_method`.
- If greater than zero, store a transposed copy of the input matrix for fast columnar access. May increase memory usage and initial setup time.
* sparse_threshold, [default=0.2]
- range: [0.0, 1.0]
- This is only used if 'hist' is specified as `tree_method`.
- Percentage threshold for treating a feature as sparse. For instance, 0.2 indicates that any feature with nonzero entries in fewer than 20% of rows will be considered sparse. May impact computation time slightly.
* enable_feature_grouping, [default=0]
- This is only used if 'hist' is specified as `tree_method`.
- If greater than zero, group complementary features together so as to improve work balance for parallel histogram aggregation. May increase memory usage and initial setup time.
* max_conflict_rate, [default=0]
- range: [0.0, 1.0]
- Only relevant when `enable_feature_grouping=1` is specified.
- Specifies the criterion for "complementary" features. By default, only features with no common nonzero rows are considered complementary. Increase this number to encourage larger feature groups.
* max_search_group, [default=100]
- Only relevant when `enable_feature_grouping=1` is specified.
- Increasing this number will result in better feature grouping, at the cost of greater initial setup time.
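
A minimal sketch of enabling the `hist` method together with the parameters above through the XGBoost C API; the training file name and the specific parameter values are illustrative only, not recommendations:

```cpp
// Illustrative only: enable the 'hist' updater and the parameters documented
// above via the C API. "train.libsvm" is a placeholder path.
#include <xgboost/c_api.h>

int main() {
  DMatrixHandle dtrain;
  XGDMatrixCreateFromFile("train.libsvm", 0, &dtrain);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);

  XGBoosterSetParam(booster, "tree_method", "hist");
  XGBoosterSetParam(booster, "max_bin", "256");
  XGBoosterSetParam(booster, "use_columnar_access", "1");
  XGBoosterSetParam(booster, "sparse_threshold", "0.2");
  XGBoosterSetParam(booster, "enable_feature_grouping", "1");
  XGBoosterSetParam(booster, "max_conflict_rate", "0.1");
  XGBoosterSetParam(booster, "max_search_group", "100");

  for (int iter = 0; iter < 10; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}
```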

Additional parameters for Dart Booster
--------------------------------------
55 changes: 30 additions & 25 deletions src/common/column_matrix.h
@@ -153,31 +153,36 @@ class ColumnMatrix {
std::vector<size_t> num_nonzeros;
num_nonzeros.resize(nfeature);
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
size_t fid = 0;
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
++fid;
}
if (type_[fid] == kDenseColumn) {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
});
} else {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
});
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];

const int nthread = omp_get_max_threads();
#pragma omp parallel num_threads(nthread)
{
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
#pragma omp for schedule(static)
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
const auto& vec = gmat.cut->row_ptr;
auto it = std::upper_bound(vec.begin(), vec.end(), bin_id);
const size_t fid = it - vec.begin() - 1;
if (type_[fid] == kDenseColumn) {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
});
} else {
XGBOOST_TYPE_SWITCH(this->dtype, {
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
});
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];
}
}
}
}
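
The rewritten loop above replaces the old linear scan for the feature owning each bin with a binary search over `gmat.cut->row_ptr`, and parallelizes the per-row inner loop with OpenMP. A standalone sketch of just the bin-to-feature lookup (the boundary values are made up for illustration):

```cpp
// Sketch of the std::upper_bound lookup used above: cut_row_ptr[f] is the
// first bin id owned by feature f, so the owner of bin_id is the last
// boundary not greater than bin_id.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Feature 0 owns bins [0,3), feature 1 owns [3,7), feature 2 owns [7,12).
  const std::vector<uint32_t> cut_row_ptr = {0, 3, 7, 12};
  const uint32_t bin_id = 5;
  auto it = std::upper_bound(cut_row_ptr.begin(), cut_row_ptr.end(), bin_id);
  const std::size_t fid = static_cast<std::size_t>(it - cut_row_ptr.begin()) - 1;
  std::cout << "bin " << bin_id << " -> feature " << fid << std::endl;  // feature 1
  return 0;
}
```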
131 changes: 89 additions & 42 deletions src/common/hist_util.cc
@@ -12,11 +12,12 @@
#include "./column_matrix.h"
#include "./hist_util.h"
#include "./quantile.h"
#include "./memory.h"

namespace xgboost {
namespace common {

void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose) {
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
const MetaInfo& info = p_fmat->info();

@@ -33,6 +34,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
}

LOG(INFO) << "Generating sketches...";
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
@@ -55,51 +57,64 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
}
}

// gather the histogram data
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
std::vector<WXQSketch::SummaryContainer> summary_array;
summary_array.resize(sketchs.size());
for (size_t i = 0; i < sketchs.size(); ++i) {
WXQSketch::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i].Reserve(max_num_bins * kFactor);
summary_array[i].SetPrune(out, max_num_bins * kFactor);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());

this->min_val.resize(info.num_col);
row_ptr.push_back(0);
for (size_t fid = 0; fid < summary_array.size(); ++fid) {
WXQSketch::SummaryContainer a;
a.Reserve(max_num_bins);
a.SetPrune(summary_array[fid], max_num_bins);
const bst_float mval = a.data[0].value;
this->min_val[fid] = mval - fabs(mval);
if (a.size > 1 && a.size <= 16) {
/* specialized code categorial / ordinal data -- use midpoints */
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
if (i == 1 || cpt > cut.back()) {
cut.push_back(cpt);
// gather the histogram data
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
const size_t bundle_size // limit this task to 1GB
= std::min(GetSystemMemory() / 2,
static_cast<unsigned long long>(1) * 1024 * 1024 * 1024)
/ (max_num_bins * kFactor * 16);
for (size_t ibegin = 0; ibegin < sketchs.size(); ibegin += bundle_size) {
const size_t iend = std::min(ibegin + bundle_size, sketchs.size());
const size_t batch_size = iend - ibegin;

std::vector<WXQSketch::SummaryContainer> summary_array;
summary_array.resize(batch_size);
if (verbose) {
LOG(INFO) << "Computing quantiles for features ["
<< ibegin << ", " << iend << ")...";
}
for (size_t i = ibegin; i < iend; ++i) {
WXQSketch::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i - ibegin].Reserve(max_num_bins * kFactor);
summary_array[i - ibegin].SetPrune(out, max_num_bins * kFactor);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());

for (size_t fid = ibegin; fid < iend; ++fid) {
WXQSketch::SummaryContainer a;
a.Reserve(max_num_bins);
a.SetPrune(summary_array[fid - ibegin], max_num_bins);
const bst_float mval = a.data[0].value;
this->min_val[fid] = mval - fabs(mval);
if (a.size > 1 && a.size <= 16) {
/* specialized code categorial / ordinal data -- use midpoints */
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
if (i == 1 || cpt > cut.back()) {
cut.push_back(cpt);
}
}
}
} else {
for (size_t i = 2; i < a.size; ++i) {
bst_float cpt = a.data[i - 1].value;
if (i == 2 || cpt > cut.back()) {
cut.push_back(cpt);
} else {
for (size_t i = 2; i < a.size; ++i) {
bst_float cpt = a.data[i - 1].value;
if (i == 2 || cpt > cut.back()) {
cut.push_back(cpt);
}
}
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt);
cut.push_back(last);
}
row_ptr.push_back(cut.size());
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt);
cut.push_back(last);
}
row_ptr.push_back(cut.size());
}
}
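
The bundling loop above caps the memory spent on per-feature quantile summaries at min(half of system memory, 1 GB) and processes the sketches in batches of `bundle_size` features. As a rough check of the resulting batch size, assuming `kFactor == 8` and roughly 16 bytes per summary entry (both are assumptions for illustration, not values taken from this diff):

```cpp
// Back-of-the-envelope reproduction of the bundle_size arithmetic above.
// kFactor = 8 and 16 bytes per entry are assumptions for illustration.
#include <algorithm>
#include <iostream>

int main() {
  const unsigned long long system_memory = 8ULL << 30;            // pretend 8 GB machine
  const unsigned long long budget =
      std::min(system_memory / 2, 1ULL << 30);                    // capped at 1 GB
  const unsigned long long max_num_bins = 256, kFactor = 8;
  const unsigned long long bundle_size = budget / (max_num_bins * kFactor * 16);
  std::cout << bundle_size << " features per bundle" << std::endl;  // prints 32768
  return 0;
}
```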

@@ -296,8 +311,15 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
return feature_nnz[a] > feature_nnz[b];
});

auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
std::vector<std::vector<unsigned>> groups_alt1, groups_alt2;

#pragma omp parallel sections
{
#pragma omp section
groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
#pragma omp section
groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
}
auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;

// take apart small, sparse groups, as it won't help speed
Expand Down Expand Up @@ -338,6 +360,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
cut = gmat.cut;

const size_t nrow = gmat.row_ptr.size() - 1;
const size_t nfeature = gmat.cut->row_ptr.size() - 1;
const uint32_t nbins = gmat.cut->row_ptr.back();

/* step 1: form feature groups */
@@ -355,10 +378,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}
}
}

std::vector<size_t> block_nnz(nblock, 0);
{
std::vector<size_t> feature_nnz(nfeature);
gmat.GetFeatureCounts(&feature_nnz[0]);
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
for (auto& fid : groups[group_id]) {
block_nnz[group_id] += feature_nnz[fid];
}
}
}

std::vector<std::vector<uint32_t>> index_temp(nblock);
std::vector<std::vector<size_t>> row_ptr_temp(nblock);
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
row_ptr_temp[block_id].reserve(nrow + 1);
row_ptr_temp[block_id].push_back(0);
index_temp[block_id].reserve(block_nnz[block_id]);
}
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
@@ -378,6 +415,16 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
std::vector<size_t> row_ptr_blk_ptr;
index_blk_ptr.push_back(0);
row_ptr_blk_ptr.push_back(0);

{
size_t tot = 0, tot2 = 0;
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
tot += index_temp[block_id].size();
tot2 += row_ptr_temp[block_id].size();
}
index.reserve(tot);
row_ptr.reserve(tot2);
}
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
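
Two changes are visible in this part of the diff: the two candidate feature groupings are computed concurrently with OpenMP sections (keeping whichever produces fewer groups), and the per-block buffers are reserved up front from the per-feature nonzero counts so the copy loops do not reallocate. A minimal sketch of the sections pattern, with stand-in functions instead of `FindGroups`:

```cpp
// Minimal sketch of '#pragma omp parallel sections' as used in
// FastFeatureGrouping: two independent passes run concurrently and the
// smaller result wins. PassA/PassB are stand-ins, not library functions.
#include <vector>

static std::vector<int> PassA() { return std::vector<int>(3, 0); }
static std::vector<int> PassB() { return std::vector<int>(5, 0); }

int main() {
  std::vector<int> alt1, alt2;
#pragma omp parallel sections
  {
#pragma omp section
    alt1 = PassA();
#pragma omp section
    alt2 = PassB();
  }
  const std::vector<int>& groups = (alt1.size() > alt2.size()) ? alt2 : alt1;
  return static_cast<int>(groups.size());  // 3: the grouping with fewer groups
}
```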
2 changes: 1 addition & 1 deletion src/common/hist_util.h
@@ -79,7 +79,7 @@ struct HistCutMatrix {
}
// create histogram cut matrix given statistics from data
// using approximate quantile sketch approach
void Init(DMatrix* p_fmat, uint32_t max_num_bins);
void Init(DMatrix* p_fmat, uint32_t max_num_bins, bool verbose = false);
};


38 changes: 38 additions & 0 deletions src/common/memory.h
@@ -0,0 +1,38 @@
/*!
* Copyright 2017 by Contributors
* \file memory.h
* \brief Utility for memory
* \author Philip Cho
*/
#ifndef XGBOOST_COMMON_MEMORY_H_
#define XGBOOST_COMMON_MEMORY_H_

#ifndef _WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif

namespace xgboost {
namespace common {

#ifndef _WIN32
inline unsigned long long GetSystemMemory()
{
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
return pages * page_size;
}
#else
inline unsigned long long GetSystemMemory()
{
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
return status.ullTotalPhys;
}
#endif

} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_MEMORY_H_
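
A short usage sketch for the new helper; the include path is an assumption about the repository layout:

```cpp
// Usage sketch: GetSystemMemory() reports total physical memory in bytes.
// The include below assumes src/common/ is on the compiler's include path.
#include <iostream>
#include "common/memory.h"

int main() {
  const unsigned long long bytes = xgboost::common::GetSystemMemory();
  std::cout << "physical memory: " << (bytes >> 20) << " MiB" << std::endl;
  return 0;
}
```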
8 changes: 6 additions & 2 deletions src/tree/fast_hist_param.h
@@ -18,8 +18,10 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
// percentage threshold for treating a feature as sparse
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
double sparse_threshold;
// use feature grouping? (default yes)
// use feature grouping? (default no)
int enable_feature_grouping;
// use columnar access structure? (default yes)
int use_columnar_access;
// when grouping features, how many "conflicts" to allow.
// conflict is when an instance has nonzero values for two or more features
// default is 0, meaning features should be strictly complementary
@@ -45,7 +47,9 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
.describe("if >0, enable feature grouping to ameliorate work imbalance "
"among worker threads");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
DMLC_DECLARE_FIELD(use_columnar_access).set_lower_bound(0).set_default(1)
.describe("if >0, store a transposed copy of input matrix for fast columnar access");
DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
.describe("when grouping features, how many \"conflicts\" to allow."
"conflict is when an instance has nonzero values for two or more features."
"default is 0, meaning features should be strictly complementary.");