add Dart booster (#1220)
marugari authored and tqchen committed Jun 8, 2016
1 parent e034fdf commit 949d1e3
Showing 3 changed files with 332 additions and 3 deletions.
25 changes: 24 additions & 1 deletion doc/parameter.md
@@ -13,7 +13,8 @@ In R-package, you can use .(dot) to replace under score in the parameters, for e
General Parameters
------------------
* booster [default=gbtree]
- which booster to use, can be gbtree or gblinear. gbtree uses tree based model while gblinear uses linear function.
- which booster to use, can be gbtree, gblinear or dart.
 gbtree and dart use tree-based models, while gblinear uses a linear function.
* silent [default=0]
- 0 means printing running messages, 1 means silent mode.
* nthread [default to maximum number of threads available if not set]
@@ -74,6 +75,28 @@ Parameters for Tree Booster
* scale_pos_weight, [default=0]
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases) See [Parameters Tuning](how_to/param_tuning.md) for more discussion. Also see Higgs Kaggle competition demo for examples: [R](../demo/kaggle-higgs/higgs-train.R ), [py1](../demo/kaggle-higgs/higgs-numpy.py ), [py2](../demo/kaggle-higgs/higgs-cv.py ), [py3](../demo/guide-python/cross_validation.py)

Additional parameters for Dart Booster
--------------------------------------
* sample_type [default="uniform"]
- type of sampling algorithm.
- "uniform": dropped trees are selected uniformly.
- "weighted": dropped trees are selected in proportion to weight.
* normalize_type [default="tree"]
- type of normalization algorithm.
- "tree": new trees have the same weight as each of the dropped trees.
weight of new trees is learning_rate / (k + learning_rate), where k is the number of dropped trees;
dropped trees are scaled by a factor of k / (k + learning_rate)
- "forest": new trees have the same weight as the sum of the dropped trees (forest).
weight of new trees is learning_rate / (1 + learning_rate);
dropped trees are scaled by a factor of 1 / (1 + learning_rate)
* rate_drop [default=0.0]
- dropout rate (the expected fraction of previous trees to drop during the dropout).
- range: [0.0, 1.0]
* skip_drop [default=0.0]
- probability of skipping the dropout procedure during a boosting iteration.
If the dropout is skipped, new trees are added in the same manner as gbtree.
- range: [0.0, 1.0]
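
The snippet below is a minimal sketch, assuming the Python package built from this commit, of how these parameters might be passed to `xgb.train`; it mirrors the new `test_dart` case in `tests/python/test_basic_models.py`, and the data paths and parameter values are placeholders rather than part of this change.

```python
import xgboost as xgb

# Placeholder paths -- point these at the agaricus demo files in the repository.
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')

param = {
    'booster': 'dart',               # enable the DART booster
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.1,            # alias: eta; also used in the normalization weights
    'sample_type': 'uniform',        # or 'weighted'
    'normalize_type': 'tree',        # or 'forest'
    'rate_drop': 0.1,                # expected fraction of trees dropped per iteration
    'skip_drop': 0.5,                # probability of skipping the dropout entirely
}

num_round = 50
bst = xgb.train(param, dtrain, num_round)

# DART applies dropout at prediction time when ntree_limit is 0 (the default),
# so pass ntree_limit explicitly to use every tree deterministically.
preds = bst.predict(dtest, ntree_limit=num_round)
```

As in the new test, `ntree_limit` is passed to `predict` because the DART booster otherwise applies dropout during prediction as well.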

Parameters for Linear Booster
-----------------------------
* lambda [default=0]
265 changes: 263 additions & 2 deletions src/gbm/gbtree.cc
@@ -17,6 +17,8 @@
#include <limits>
#include "../common/common.h"

#include "../common/random.h"

namespace xgboost {
namespace gbm {

@@ -47,6 +49,42 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
}
};

/*! \brief training parameters */
struct DartTrainParam : public dmlc::Parameter<DartTrainParam> {
/*! \brief whether to suppress printing info during training */
bool silent;
/*! \brief type of sampling algorithm */
int sample_type;
/*! \brief type of normalization algorithm */
int normalize_type;
/*! \brief fraction of trees dropped during the dropout */
float rate_drop;
/*! \brief probability of skipping the dropout */
float skip_drop;
/*! \brief learning step size for a time */
float learning_rate;
// declare parameters
DMLC_DECLARE_PARAMETER(DartTrainParam) {
DMLC_DECLARE_FIELD(silent).set_default(false)
.describe("Not print information during trainig.");
DMLC_DECLARE_FIELD(sample_type).set_default(0)
.add_enum("uniform", 0)
.add_enum("weighted", 1)
.describe("Different types of sampling algorithm.");
DMLC_DECLARE_FIELD(normalize_type).set_default(0)
.add_enum("tree", 0)
.add_enum("forest", 1)
.describe("Different types of normalization algorithm.");
DMLC_DECLARE_FIELD(rate_drop).set_range(0.0f, 1.0f).set_default(0.0f)
.describe("Parameter of how many trees are dropped.");
DMLC_DECLARE_FIELD(skip_drop).set_range(0.0f, 1.0f).set_default(0.0f)
.describe("Parameter of whether to drop trees.");
DMLC_DECLARE_FIELD(learning_rate).set_lower_bound(0.0f).set_default(0.3f)
.describe("Learning rate(step size) of update.");
DMLC_DECLARE_ALIAS(learning_rate, eta);
}
};

/*! \brief model parameters */
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
/*! \brief number of trees */
@@ -313,8 +351,9 @@ class GBTree : public GradientBooster {
}
}
// commit new trees all at once
inline void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
virtual void
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group);
@@ -475,14 +514,236 @@ std::vector<std::unique_ptr<TreeUpdater> > updaters;
std::vector<std::unique_ptr<TreeUpdater> > updaters;
};

// dart
class Dart : public GBTree {
public:
Dart() {}

void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
GBTree::Configure(cfg);
if (trees.size() == 0) {
dparam.InitAllowUnknown(cfg);
}
}

void Load(dmlc::Stream* fi) override {
GBTree::Load(fi);
weight_drop.resize(mparam.num_trees);
if (mparam.num_trees != 0) {
fi->Read(&weight_drop);
}
}

void Save(dmlc::Stream* fo) const override {
GBTree::Save(fo);
if (weight_drop.size() != 0) {
fo->Write(weight_drop);
}
}

// predict the leaf scores with dropout if ntree_limit = 0
void Predict(DMatrix* p_fmat,
int64_t buffer_offset,
std::vector<float>* out_preds,
unsigned ntree_limit) override {
DropTrees(ntree_limit);
const MetaInfo& info = p_fmat->info();
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
InitThreadTemp(nthread);
std::vector<float> &preds = *out_preds;
const size_t stride = p_fmat->info().num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();

iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
// parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
RegTree::FVec &feats = thread_temp[tid];
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
CHECK_LT(static_cast<size_t>(ridx), info.num_row);
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(batch[i],
buffer_offset < 0 ? -1 : buffer_offset + ridx,
gid, info.GetRoot(ridx), &feats,
&preds[ridx * mparam.num_output_group + gid], stride,
ntree_limit);
}
}
}
}

void Predict(const SparseBatch::Inst& inst,
std::vector<float>* out_preds,
unsigned ntree_limit,
unsigned root_index) override {
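// passing a nonzero count to DropTrees leaves idx_drop empty,
// so no dropout is applied for this single-instance prediction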
DropTrees(1);
if (thread_temp.size() == 0) {
thread_temp.resize(1, RegTree::FVec());
thread_temp[0].Init(mparam.num_feature);
}
out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(inst, -1, gid, root_index, &thread_temp[0],
&(*out_preds)[gid], mparam.num_output_group,
ntree_limit);
}
}

protected:
// commit new trees all at once
void
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) override {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group);
}
mparam.num_trees += static_cast<int>(new_trees.size());
size_t num_drop = NormalizeTrees(new_trees.size());
if (dparam.silent != 1) {
LOG(INFO) << "drop " << num_drop << " trees, "
<< "weight = " << weight_drop.back();
}
}
// predict the leaf scores without dropped trees
inline void Pred(const RowBatch::Inst &inst,
int64_t buffer_index,
int bst_group,
unsigned root_index,
RegTree::FVec *p_feats,
float *out_pred,
size_t stride,
unsigned ntree_limit) {
float psum = 0.0f;
// sum of leaf vector
std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
const int64_t bid = this->BufferOffset(buffer_index, bst_group);
p_feats->Fill(inst);
for (size_t i = 0; i < trees.size(); ++i) {
if (tree_info[i] == bst_group) {
bool drop = (std::find(idx_drop.begin(), idx_drop.end(), i) != idx_drop.end());
if (!drop) {
int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
psum += weight_drop[i] * (*trees[i])[tid].leaf_value();
for (int j = 0; j < mparam.size_leaf_vector; ++j) {
vec_psum[j] += weight_drop[i] * trees[i]->leafvec(tid)[j];
}
}
}
}
p_feats->Drop(inst);
// update the buffered results
if (bid >= 0 && ntree_limit == 0) {
pred_counter[bid] = static_cast<unsigned>(trees.size());
pred_buffer[bid] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
pred_buffer[bid + i + 1] = vec_psum[i];
}
}
out_pred[0] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
out_pred[stride * (i + 1)] = vec_psum[i];
}
}

// select dropped trees
inline void DropTrees(unsigned ntree_limit_drop) {
std::uniform_real_distribution<> runif(0.0, 1.0);
auto& rnd = common::GlobalRandom();
// reset
idx_drop.clear();
// sample dropped trees
bool skip = false;
if (dparam.skip_drop > 0.0) skip = (runif(rnd) < dparam.skip_drop);
if (ntree_limit_drop == 0 && !skip) {
if (dparam.sample_type == 1) {
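// "weighted" sampling: tree i is dropped with probability
// rate_drop * num_trees * weight_i / sum_weight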
float sum_weight = 0.0;
for (size_t i = 0; i < weight_drop.size(); ++i) {
sum_weight += weight_drop[i];
}
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop * weight_drop.size() * weight_drop[i] / sum_weight) {
idx_drop.push_back(i);
}
}
} else {
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop) {
idx_drop.push_back(i);
}
}
}
}
}
// set normalization factors
inline size_t NormalizeTrees(size_t size_new_trees) {
float lr = 1.0 * dparam.learning_rate / size_new_trees;
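// "tree" (normalize_type 0): new trees get weight lr / (num_drop + lr) and
// dropped trees are rescaled by num_drop / (num_drop + lr);
// "forest" (normalize_type 1): the factors are lr / (1 + lr) and 1 / (1 + lr)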
size_t num_drop = idx_drop.size();
if (num_drop == 0) {
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(1.0);
}
} else {
if (dparam.normalize_type == 1) {
// normalize_type 1
float factor = 1.0 / (1.0 + lr);
for (size_t i = 0; i < idx_drop.size(); ++i) {
weight_drop[i] *= factor;
}
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(lr * factor);
}
} else {
// normalize_type 0
float factor = 1.0 * num_drop / (num_drop + lr);
for (size_t i = 0; i < idx_drop.size(); ++i) {
weight_drop[i] *= factor;
}
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(1.0 * lr / (num_drop + lr));
}
}
}
// reset
idx_drop.clear();
return num_drop;
}

// --- data structure ---
// training parameter
DartTrainParam dparam;
/*! \brief per-tree weights, rescaled when trees are dropped */
std::vector<float> weight_drop;
// indexes of dropped trees
std::vector<size_t> idx_drop;
};

// register the parameters and gradient boosters
DMLC_REGISTER_PARAMETER(GBTreeModelParam);
DMLC_REGISTER_PARAMETER(GBTreeTrainParam);
DMLC_REGISTER_PARAMETER(DartTrainParam);

XGBOOST_REGISTER_GBM(GBTree, "gbtree")
.describe("Tree booster, gradient boosted trees.")
.set_body([]() {
return new GBTree();
});
XGBOOST_REGISTER_GBM(Dart, "dart")
.describe("Tree booster, dart.")
.set_body([]() {
return new Dart();
});
} // namespace gbm
} // namespace xgboost
45 changes: 45 additions & 0 deletions tests/python/test_basic_models.py
@@ -23,6 +23,51 @@ def test_glm(self):
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1

def test_dart(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
param = {'max_depth': 5, 'objective': 'binary:logistic', 'booster': 'dart', 'silent': False}
# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dtest, ntree_limit=num_round)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
# error must be smaller than 10%
assert err < 0.1

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
# save model
bst.save_model('xgb.model.dart')
# load model and data in
bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2, ntree_limit=num_round)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0

# check whether sample_type and normalize_type work
num_round = 50
param['silent'] = True
param['learning_rate'] = 0.1
param['rate_drop'] = 0.1
preds_list = []
for p in [[p0, p1] for p0 in ['uniform', 'weighted'] for p1 in ['tree', 'forest']]:
param['sample_type'] = p[0]
param['normalize_type'] = p[1]
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest, ntree_limit=num_round)
err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
preds_list.append(preds)

for ii in range(len(preds_list)):
for jj in range(ii + 1, len(preds_list)):
assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0

def test_eta_decay(self):
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
