Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always respect forced splits, even when feature_fraction < 1.0 (fixes #4601) #4725

Merged
merged 17 commits into from
Nov 10, 2021
42 changes: 34 additions & 8 deletions src/treelearner/serial_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <algorithm>
#include <queue>
#include <set>
#include <unordered_map>
#include <utility>

Expand Down Expand Up @@ -322,17 +323,22 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
}

// Backward-compatible overload: search best splits with no forced features
// (delegates to the two-argument overload with a null force_features set).
void SerialTreeLearner::FindBestSplits(const Tree* tree) {
FindBestSplits(tree, nullptr);
}

void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set<int>* force_features) {
std::vector<int8_t> is_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static, 256) if (num_features_ >= 512)
#pragma omp parallel for schedule(static, 256) if (num_features_ >= 512)
tongwu-sh marked this conversation as resolved.
Show resolved Hide resolved
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
if (!col_sampler_.is_feature_used_bytree()[feature_index] && (force_features == nullptr || force_features->find(feature_index) == force_features->end())) continue;
if (parent_leaf_histogram_array_ != nullptr
&& !parent_leaf_histogram_array_[feature_index].is_splittable()) {
&& !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
is_feature_used[feature_index] = 1;
}

bool use_subtract = parent_leaf_histogram_array_ != nullptr;

#ifdef USE_CUDA
Expand All @@ -344,6 +350,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) {
#else
ConstructHistograms(is_feature_used, use_subtract);
#endif

FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
}

Expand Down Expand Up @@ -463,11 +470,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
std::unordered_map<int, SplitInfo> forceSplitMap;
q.push(std::make_pair(left, *left_leaf));
while (!q.empty()) {
// before processing next node from queue, store info for current left/right leaf
// store "best split" for left and right, even if they might be overwritten by forced split
if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
FindBestSplits(tree);
}
FindBestSplitsForForceSplitLeaf(tree, left_leaf, right_leaf, left, right);

// then, compute own splits
SplitInfo left_split;
SplitInfo right_split;
Expand Down Expand Up @@ -561,6 +565,28 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
return result_count;
}

// Before processing the next node from the forced-split queue, store the
// "best split" info for the current left/right leaves, even though it may
// later be overwritten by the forced split itself. Any feature named in the
// left/right forced-split settings is passed down as a forced feature so it
// is considered even if column sampling did not select it for this tree.
void SerialTreeLearner::FindBestSplitsForForceSplitLeaf(Tree* tree, int* left_leaf, int* right_leaf, Json left_force_split_leaf_setting, Json right_force_split_leaf_setting) {
  if (!BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
    return;
  }
  // Collect inner feature indices required by the forced-split settings.
  std::set<int> forced_inner_features;
  for (const Json& leaf_setting : {left_force_split_leaf_setting, right_force_split_leaf_setting}) {
    if (!leaf_setting.is_null()) {
      const int outer_feature_index = leaf_setting["feature"].int_value();
      forced_inner_features.insert(train_data_->InnerFeatureIndex(outer_feature_index));
    }
  }
  FindBestSplits(tree, &forced_inner_features);
}

void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
int* right_leaf, bool update_cnt) {
Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer);
Expand Down
5 changes: 5 additions & 0 deletions src/treelearner/serial_tree_learner.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <memory>
#include <random>
#include <vector>
#include <set>

#include "col_sampler.hpp"
#include "data_partition.hpp"
Expand Down Expand Up @@ -142,6 +143,8 @@ class SerialTreeLearner: public TreeLearner {

virtual void FindBestSplits(const Tree* tree);

virtual void FindBestSplits(const Tree* tree, const std::set<int>* force_features);

virtual void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);

virtual void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract, const Tree*);
Expand All @@ -165,6 +168,8 @@ class SerialTreeLearner: public TreeLearner {
int32_t ForceSplits(Tree* tree, int* left_leaf, int* right_leaf,
int* cur_depth);

void FindBestSplitsForForceSplitLeaf(LightGBM::Tree* tree, int* left_leaf, int* right_leaf, Json left, Json right);

/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
Expand Down
28 changes: 28 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# coding: utf-8
import copy
import itertools
import json
import math
import pickle
import platform
Expand Down Expand Up @@ -2887,3 +2888,30 @@ def hook(obj):
dumped_model_str = str(bst.dump_model(5, 0, object_hook=hook))
assert "leaf_value" not in dumped_model_str
assert "LV" in dumped_model_str


def test_force_split_with_feature_fraction():
    # Regression test: a forced split must be honored even when
    # feature_fraction < 1.0 could exclude the forced feature from sampling.
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train)

    forced_split = {
        "feature": 0,
        "threshold": 0.5
    }

    with open("forced_split.json", "w") as f:
        json.dump(forced_split, f)

    params = {
        "objective": "regression",
        "feature_fraction": 0.6,
        "force_col_wise": True,
        "feature_fraction_seed": 1,
        "forcedsplits_filename": "forced_split.json"
    }

    gbm = lgb.train(params, lgb_train)
    predictions = gbm.predict(X_test)
    ret = mean_absolute_error(y_test, predictions)

    assert ret < 2.0