From 0050954c85f8df08ce43c6c0c85204328ebdb057 Mon Sep 17 00:00:00 2001 From: btrotta Date: Fri, 27 Mar 2020 14:38:32 +1100 Subject: [PATCH 01/10] Path smoothing --- docs/Parameters-Tuning.rst | 2 + docs/Parameters.rst | 14 + include/LightGBM/config.h | 9 + include/LightGBM/tree.h | 5 +- src/io/config.cpp | 8 + src/io/config_auto.cpp | 5 + src/treelearner/feature_histogram.hpp | 359 ++++++++++++------ src/treelearner/leaf_splits.hpp | 12 +- src/treelearner/serial_tree_learner.cpp | 43 ++- .../voting_parallel_tree_learner.cpp | 12 +- tests/python_package_test/test_engine.py | 17 + 11 files changed, 348 insertions(+), 138 deletions(-) diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index d2579361a1c9..1f39b29ded46 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -82,3 +82,5 @@ Deal with Over-fitting - Try ``extra_trees`` .. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 + +- Try increasing ``path_smoothing`` diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 0d9ade659fef..13896a6e2fdf 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -514,6 +514,20 @@ Learning Control Parameters - applied once per forest +- ``path_smooth`` :raw-html:`🔗︎`, default = ``0``, type = double, constraints: ``path_smooth >= 0.0`` + + - controls smoothing applied to tree nodes + + - helps prevent overfitting on leaves with few samples + + - if set to zero, no smoothing is applied + + - if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + + - larger values give stronger regularisation + + - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + - ``verbosity`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``verbose`` - controls the level of LightGBM's verbosity diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 057f56e99491..8feff729d0a8 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -488,6 +488,15 @@ struct Config { // desc = applied once per forest std::vector cegb_penalty_feature_coupled; + // check = >= 0.0 + // desc = controls smoothing applied to tree nodes + // desc = helps prevent overfitting on leaves with few samples + // desc = if set to zero, no smoothing is applied + // desc = if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + // desc = larger values give stronger regularisation + // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + double path_smooth = 0; + // alias = verbose // desc = controls the level of LightGBM's verbosity // desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 55568e41f544..66ba8fb8eacb 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -142,6 +142,9 @@ class Tree { /*! \brief Get depth of specific leaf*/ inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; } + /*! \brief Get parent of specific leaf*/ + inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; } + /*! 
\brief Get feature of specific split*/ inline int split_feature(int split_idx) const { return split_feature_[split_idx]; } @@ -163,8 +166,6 @@ class Tree { return split_feature_inner_[node_idx]; } - inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; } - inline uint32_t threshold_in_bin(int node_idx) const { return threshold_in_bin_[node_idx]; } diff --git a/src/io/config.cpp b/src/io/config.cpp index 0cf1d3c8bf21..fb831b4ec495 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -317,6 +317,14 @@ void Config::CheckParamConflict() { force_col_wise = true; force_row_wise = false; } + // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated + // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can + // be 1 when there is actually no data in the leaf. In rare cases this can cause a bug because with path smoothing the + // calculated split gain can be positive even with zero gradient and hessian. + if (path_smooth > kEpsilon && min_data_in_leaf < 2) { + min_data_in_leaf = 2; + Log::Warning("min_data_in_leaf has been increased to 2 because this is required when path smoothing is active."); + } if (is_parallel && monotone_constraints_method == std::string("intermediate")) { // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 59cc62a5d375..564db37c3334 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -225,6 +225,7 @@ const std::unordered_set& Config::parameter_set() { "cegb_penalty_split", "cegb_penalty_feature_lazy", "cegb_penalty_feature_coupled", + "path_smooth", "verbosity", "input_model", "output_model", @@ -443,6 +444,9 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); } + GetDouble(params, "path_smooth", &path_smooth); + CHECK_GE(path_smooth, 0.0); + GetInt(params, "verbosity", &verbosity); GetString(params, "input_model", &input_model); @@ -646,6 +650,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n"; str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n"; + str_buf << "[path_smooth: " << path_smooth << "]\n"; str_buf << "[verbosity: " << verbosity << "]\n"; str_buf << "[max_bin: " << max_bin << "]\n"; str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n"; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 9c859f7ec1a2..79bf63c34457 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -85,22 +85,29 @@ class FeatureHistogram { void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, const ConstraintEntry& constraints, + double parent_output, SplitInfo* output) { output->default_left = true; output->gain = kMinScore; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, - constraints, output); + constraints, parent_output, output); output->gain *= meta_->penalty; } - template - double BeforeNumercal(double sum_gradient, double sum_hessian, + template + double 
BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; - double gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double gain_shift; + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + gain_shift = GetLeafGain( + sum_gradient, sum_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, 0, 0, 0); + } *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { @@ -144,21 +151,30 @@ class FeatureHistogram { template void FuncForNumricalL2() { -#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT + if (meta_->config->path_smooth > kEpsilon) { + FuncForNumricalL3(); + } else { + FuncForNumricalL3(); + } + } + + template + void FuncForNumricalL3() { +#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING #define LAMBDA_ARGUMENTS \ double sum_gradient, double sum_hessian, data_size_t num_data, \ - const ConstraintEntry &constraints, SplitInfo *output -#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, output, &rand_threshold + const ConstraintEntry &constraints, double parent_output, SplitInfo *output +#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, output, &rand_threshold #define FUNC_ARGUMENTS \ - sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, output, \ - rand_threshold + sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \ + output, rand_threshold, parent_output if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); FindBestThresholdSequentially( FUNC_ARGUMENTS); @@ -169,7 +185,7 @@ class FeatureHistogram { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); FindBestThresholdSequentially( FUNC_ARGUMENTS); @@ -182,7 +198,7 @@ class FeatureHistogram { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); FindBestThresholdSequentially( FUNC_ARGUMENTS); @@ -191,10 +207,9 @@ class FeatureHistogram { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); - FindBestThresholdSequentially( + FindBestThresholdSequentially( FUNC_ARGUMENTS); output->default_left = false; }; @@ -224,42 +239,52 @@ class FeatureHistogram { template void FuncForCategoricalL1() { + if (meta_->config->path_smooth > kEpsilon) { + FuncForCategoricalL2(); + } else { + FuncForCategoricalL2(); + } + } + + template + void FuncForCategoricalL2() { #define ARGUMENTS \ std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ - std::placeholders::_4, std::placeholders::_5 + std::placeholders::_4, std::placeholders::_5, std::placeholders::_6 if (meta_->config->lambda_l1 > 0) { if (meta_->config->max_delta_step > 0) { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, true>, + 
USE_RAND, USE_MC, true, true, USE_SMOOTHING>, this, ARGUMENTS); } else { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, false>, + USE_RAND, USE_MC, true, false, USE_SMOOTHING>, this, ARGUMENTS); } } else { if (meta_->config->max_delta_step > 0) { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, true>, + USE_RAND, USE_MC, false, true, USE_SMOOTHING>, this, ARGUMENTS); } else { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, false>, + USE_RAND, USE_MC, false, false, USE_SMOOTHING>, this, ARGUMENTS); } } #undef ARGUMENTS } - template + template void FindBestThresholdCategoricalInner(double sum_gradient, double sum_hessian, data_size_t num_data, const ConstraintEntry& constraints, + double parent_output, SplitInfo* output) { is_splittable_ = false; output->default_left = false; @@ -267,9 +292,17 @@ class FeatureHistogram { data_size_t best_left_count = 0; double best_sum_left_gradient = 0; double best_sum_left_hessian = 0; - double gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double gain_shift; + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated + // with the larger categorical l2, whereas min_split_gain uses the original l2. + gain_shift = GetLeafGain(sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, + num_data, 0); + } double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; bool is_full_categorical = meta_->missing_type == MissingType::None; @@ -317,10 +350,10 @@ class FeatureHistogram { } } // current split gain - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints, 0); + constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output); // gain with split is worse than without split if (current_gain <= min_gain_shift) { continue; @@ -419,10 +452,11 @@ class FeatureHistogram { continue; } } - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints, 0); + meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, + left_count, right_count, parent_output); if (current_gain <= min_gain_shift) { continue; } @@ -440,19 +474,18 @@ class FeatureHistogram { } if (is_splittable_) { - output->left_output = - CalculateSplittedLeafOutput( - best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints); + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints, meta_->config->path_smooth, best_left_count, parent_output); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian 
- kEpsilon; - output->right_output = - CalculateSplittedLeafOutput( - sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, meta_->config->path_smooth, + num_data - best_left_count, parent_output); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = @@ -484,22 +517,23 @@ class FeatureHistogram { void GatherInfoForThreshold(double sum_gradient, double sum_hessian, uint32_t threshold, data_size_t num_data, - SplitInfo* output) { + double parent_output, SplitInfo* output) { if (meta_->bin_type == BinType::NumericalBin) { GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, parent_output, output); } else { GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, parent_output, output); } } void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian, uint32_t threshold, data_size_t num_data, - SplitInfo* output) { - double gain_shift = GetLeafGain( + double parent_output, SplitInfo* output) { + bool use_smoothing = meta_->config->path_smooth > kEpsilon; + double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; // do stuff here @@ -543,13 +577,28 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_hessian = sum_hessian - sum_right_hessian; data_size_t left_count = num_data - right_count; - double current_gain = - GetLeafGain( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step) + - GetLeafGain( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double current_gain; + if (use_smoothing) { + current_gain = + GetLeafGain( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output) + + GetLeafGain( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, parent_output); + } else { + current_gain = + GetLeafGain( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output) + + GetLeafGain( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, parent_output); + } // gain with split is worse than without split if (std::isnan(current_gain) || current_gain <= min_gain_shift) { @@ -561,16 +610,33 @@ class FeatureHistogram { // update split information output->threshold = threshold; - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, 
meta_->config->max_delta_step); + if (use_smoothing) { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output); + } else { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output); + } output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + if (use_smoothing) { + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, + right_count, parent_output); + } else { + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, + right_count, parent_output); + } output->right_count = num_data - left_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; @@ -580,13 +646,12 @@ class FeatureHistogram { void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, uint32_t threshold, - data_size_t num_data, + data_size_t num_data, double parent_output, SplitInfo* output) { // get SplitInfo for a given one-hot categorical split. 
output->default_left = false; - double gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; bool is_full_categorical = meta_->missing_type == MissingType::None; int used_bin = meta_->num_bin - 1 + is_full_categorical; @@ -609,13 +674,33 @@ class FeatureHistogram { double sum_left_gradient = grad; double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain - double current_gain = - GetLeafGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step) + - GetLeafGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + double current_gain; + bool use_smoothing = meta_->config->path_smooth > kEpsilon; + if (use_smoothing) { + current_gain = + GetLeafGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, + parent_output) + + GetLeafGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, + parent_output); + } else { + current_gain = + GetLeafGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, + parent_output) + + GetLeafGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, + parent_output); + } if (std::isnan(current_gain) || current_gain <= min_gain_shift) { output->gain = kMinScore; Log::Warning( @@ -623,15 +708,31 @@ class FeatureHistogram { return; } - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + if (use_smoothing) { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, left_count, + parent_output); + } else { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, left_count, + parent_output); + } output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + if (use_smoothing) { + output->right_output = CalculateSplittedLeafOutput( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, right_count, + parent_output); + } else { + output->right_output = CalculateSplittedLeafOutput( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, right_count, + parent_output); + } output->right_count = right_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_right_hessian - kEpsilon; @@ -670,35 +771,42 @@ class 
FeatureHistogram { return Common::Sign(s) * reg_s; } - template + template static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, - double l2, double max_delta_step) { + double l2, double max_delta_step, + double smoothing, data_size_t num_data, + double parent_output) { + double ret; if (USE_L1) { - double ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); + ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); if (USE_MAX_OUTPUT) { if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - return Common::Sign(ret) * max_delta_step; + ret = Common::Sign(ret) * max_delta_step; } } - return ret; } else { - double ret = -sum_gradients / (sum_hessians + l2); + ret = -sum_gradients / (sum_hessians + l2); if (USE_MAX_OUTPUT) { if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - return Common::Sign(ret) * max_delta_step; + ret = Common::Sign(ret) * max_delta_step; } } - return ret; } + if (USE_SMOOTHING) { + ret = ret * (num_data / smoothing) / (num_data / smoothing + 1) \ + + parent_output / (num_data / smoothing + 1); + } + return ret; } - template + template static double CalculateSplittedLeafOutput( double sum_gradients, double sum_hessians, double l1, double l2, - double max_delta_step, const ConstraintEntry& constraints) { - double ret = CalculateSplittedLeafOutput( - sum_gradients, sum_hessians, l1, l2, max_delta_step); + double max_delta_step, const ConstraintEntry& constraints, + double smoothing, data_size_t num_data, double parent_output) { + double ret = CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output); if (USE_MC) { if (ret < constraints.min) { ret = constraints.min; @@ -710,30 +818,36 @@ class FeatureHistogram { } private: - template + template static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, double sum_right_gradients, double sum_right_hessians, double l1, double l2, double max_delta_step, const ConstraintEntry& constraints, - int8_t monotone_constraint) { + int8_t monotone_constraint, + double smoothing, + data_size_t left_count, + data_size_t right_count, + double parent_output) { if (!USE_MC) { - return GetLeafGain(sum_left_gradients, - sum_left_hessians, l1, l2, - max_delta_step) + - GetLeafGain(sum_right_gradients, - sum_right_hessians, l1, l2, - max_delta_step); + return GetLeafGain(sum_left_gradients, + sum_left_hessians, l1, l2, + max_delta_step, smoothing, + left_count, parent_output) + + GetLeafGain(sum_right_gradients, + sum_right_hessians, l1, l2, + max_delta_step, smoothing, + right_count, parent_output); } else { double left_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, - constraints); + constraints, smoothing, left_count, parent_output); double right_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, - constraints); + constraints, smoothing, right_count, parent_output); if (((monotone_constraint > 0) && (left_output > right_output)) || ((monotone_constraint < 0) && (left_output < right_output))) { return 0; @@ -745,10 +859,11 @@ class FeatureHistogram { } } - template + template static double GetLeafGain(double sum_gradients, double sum_hessians, - double l1, double l2, double max_delta_step) { - if (!USE_MAX_OUTPUT) { + double l1, double l2, double max_delta_step, + double smoothing, data_size_t num_data, double 
parent_output) { + if (!USE_MAX_OUTPUT && !USE_SMOOTHING) { if (USE_L1) { const double sg_l1 = ThresholdL1(sum_gradients, l1); return (sg_l1 * sg_l1) / (sum_hessians + l2); @@ -756,10 +871,9 @@ class FeatureHistogram { return (sum_gradients * sum_gradients) / (sum_hessians + l2); } } else { - double output = CalculateSplittedLeafOutput( - sum_gradients, sum_hessians, l1, l2, max_delta_step); - return GetLeafGainGivenOutput(sum_gradients, sum_hessians, l1, l2, - output); + double output = CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output); + return GetLeafGainGivenOutput(sum_gradients, sum_hessians, l1, l2, output); } } @@ -776,13 +890,13 @@ class FeatureHistogram { } } - template void FindBestThresholdSequentially(double sum_gradient, double sum_hessian, data_size_t num_data, const ConstraintEntry& constraints, double min_gain_shift, SplitInfo* output, - int rand_threshold) { + int rand_threshold, double parent_output) { const int8_t offset = meta_->offset; double best_sum_left_gradient = NAN; double best_sum_left_hessian = NAN; @@ -837,11 +951,12 @@ class FeatureHistogram { } } // current split gain - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints, meta_->monotone_type); + constraints, meta_->monotone_type, meta_->config->path_smooth, + left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain <= min_gain_shift) { continue; @@ -921,11 +1036,12 @@ class FeatureHistogram { } } // current split gain - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints, meta_->monotone_type); + constraints, meta_->monotone_type, meta_->config->path_smooth, left_count, + right_count, parent_output); // gain with split is worse than without split if (current_gain <= min_gain_shift) { continue; @@ -948,19 +1064,21 @@ class FeatureHistogram { // update split information output->threshold = best_threshold; output->left_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, constraints); + meta_->config->max_delta_step, constraints, meta_->config->path_smooth, + best_left_count, parent_output); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->right_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints); + constraints, meta_->config->path_smooth, num_data - best_left_count, + parent_output); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = @@ -976,7 +1094,7 @@ class FeatureHistogram { bool is_splittable_ = true; std::function + double, SplitInfo*)> find_best_threshold_fun_; }; @@ -1133,7 +1251,8 @@ class HistogramPool { if (old_config->lambda_l1 != config->lambda_l1 || 
old_config->monotone_constraints != config->monotone_constraints || old_config->extra_trees != config->extra_trees || - old_config->max_delta_step != config->max_delta_step) { + old_config->max_delta_step != config->max_delta_step || + old_config->path_smooth != config->path_smooth) { #pragma omp parallel for schedule(static) for (int i = 0; i < cache_size_; ++i) { for (int j = 0; j < train_data->num_features(); ++j) { diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 6d18c3b6f174..1f396198e580 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -21,7 +21,7 @@ class LeafSplits { public: explicit LeafSplits(data_size_t num_data) :num_data_in_leaf_(num_data), num_data_(num_data), - data_indices_(nullptr) { + data_indices_(nullptr), weight_(0) { } void ResetNumData(data_size_t num_data) { num_data_ = num_data; @@ -37,11 +37,13 @@ class LeafSplits { * \param sum_gradients * \param sum_hessians */ - void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { + void Init(int leaf, const DataPartition* data_partition, double sum_gradients, + double sum_hessians, double weight) { leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; + weight_ = weight; } /*! @@ -135,6 +137,10 @@ class LeafSplits { /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } + /*! \brief Get weight of current leaf */ + double weight() const { return weight_; } + + private: /*! \brief current leaf index */ @@ -149,6 +155,8 @@ class LeafSplits { double sum_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; + /*! 
\brief weight of current leaf */ + double weight_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b7569d22c8e2..09cbb16921e7 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -212,9 +212,16 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* sum_grad += gradients[idx]; sum_hess += hessians[idx]; } - double output = FeatureHistogram::CalculateSplittedLeafOutput( - sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, - config_->max_delta_step); + double output; + if ((config_->path_smooth > kEpsilon) & (i > 0)) { + output = FeatureHistogram::CalculateSplittedLeafOutput( + sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, + config_->max_delta_step, config_->path_smooth, cnt_leaf_data, tree->leaf_parent(i)); + } else { + output = FeatureHistogram::CalculateSplittedLeafOutput( + sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, + config_->max_delta_step, config_->path_smooth, cnt_leaf_data, 0); + } auto old_leaf_output = tree->LeafOutput(i); auto new_leaf_output = output * tree->shrinkage(); tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output); @@ -447,6 +454,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, left_leaf_splits->sum_hessians(), left_threshold, left_leaf_splits->num_data_in_leaf(), + left_leaf_splits->weight(), &left_split); left_split.feature = left_feature; forceSplitMap[*left_leaf] = left_split; @@ -468,6 +476,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, right_leaf_splits->sum_hessians(), right_threshold, right_leaf_splits->num_data_in_leaf(), + right_leaf_splits->weight(), &right_split); right_split.feature = right_feature; forceSplitMap[*right_leaf] = right_split; @@ -611,18 +620,22 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, CHECK_GT(best_split_info.left_count, 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); } else { CHECK_GT(best_split_info.right_count, 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); } auto leaves_need_update = constraints_->Update( tree, is_numerical_split, *left_leaf, *right_leaf, @@ -683,9 +696,19 @@ void SerialTreeLearner::ComputeBestSplitForFeature( return; } SplitInfo new_split; - histogram_array_[feature_index].FindBestThreshold( - leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, - constraints_->Get(leaf_splits->leaf_index()), &new_split); + double parent_output; + if (leaf_splits->leaf_index() == 0) { + // for root leaf the "parent" output is its own output because we don't apply any smoothing to the root + parent_output = FeatureHistogram::CalculateSplittedLeafOutput( + 
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), config_->lambda_l1, + config_->lambda_l2, config_->max_delta_step, constraints_->Get(leaf_splits->leaf_index()), + config_->path_smooth, static_cast(num_data), 0); + } else { + parent_output = leaf_splits->weight(); + } + histogram_array_[feature_index].FindBestThreshold( + leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, + constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index e7a8e55054f7..defd4c206377 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -436,17 +436,21 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, if (best_split_info.left_count < best_split_info.right_count) { smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); } else { smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); } } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 51be083a9f01..efbb65129287 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -2034,6 +2034,23 @@ def test_extra_trees(self): err_new = mean_squared_error(y, predicted_new) self.assertLess(err, err_new) + def test_path_smoothing(self): + # check path smoothing increases regularization + X, y = load_boston(True) + lgb_x = lgb.Dataset(X, label=y) + params = {'objective': 'regression', + 'num_leaves': 32, + 'verbose': -1, + 'seed': 0} + est = lgb.train(params, lgb_x, num_boost_round=10) + predicted = est.predict(X) + err = mean_squared_error(y, predicted) + params['path_smooth'] = 1 + est = lgb.train(params, lgb_x, num_boost_round=10) + predicted_new = est.predict(X) + err_new = mean_squared_error(y, predicted_new) + self.assertLess(err, err_new) + @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') def test_trees_to_dataframe(self): From 198f87cfa47d5ec9df8305036ac4f216420b081b Mon Sep 17 00:00:00 2001 From: btrotta Date: Fri, 27 Mar 2020 16:05:57 +1100 Subject: [PATCH 02/10] Try to fix issue with gpu version. --- src/treelearner/gpu_tree_learner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index ba0b43c58660..f8f57e4b4236 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -1090,8 +1090,8 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right Log::Fatal("Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); From 45def6c7c679bdf54ee407bedb6f05d9184e1b5b Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 30 Mar 2020 11:55:51 +1100 Subject: [PATCH 03/10] Fix failing CI for R package. --- .ci/test_r_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 85c277e42a57..d15ceeed8f95 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -91,7 +91,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then exit -1 fi -ALLOWED_CHECK_NOTES=3 +ALLOWED_CHECK_NOTES=4 NUM_CHECK_NOTES=$( cat ${LOG_FILE_NAME} \ | grep -e '^Status: .* NOTE.*' \ From 3a111824149608471c347f975f336c0091ade828 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 7 Apr 2020 11:46:45 +1000 Subject: [PATCH 04/10] Minor fixes. --- docs/Parameters.rst | 4 +++- include/LightGBM/config.h | 3 ++- src/treelearner/feature_histogram.hpp | 31 ++++++++++----------------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 13896a6e2fdf..0652fad0e1b3 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -526,7 +526,9 @@ Learning Control Parameters - larger values give stronger regularisation - - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node + + - (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) - ``verbosity`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``verbose`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 8feff729d0a8..f08b5171984e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -494,7 +494,8 @@ struct Config { // desc = if set to zero, no smoothing is applied // desc = if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. 
// desc = larger values give stronger regularisation - // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node + // descl2 = (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) double path_smooth = 0; // alias = verbose diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 79bf63c34457..bb2a0f9f44c0 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -95,19 +95,15 @@ class FeatureHistogram { } template - double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, + double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data, SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; - double gain_shift; - if (USE_SMOOTHING) { - gain_shift = GetLeafGainGivenOutput( - sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); - } else { - gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, 0, 0, 0); - } + double output_without_split = CalculateSplittedLeafOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); + double gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, output_without_split); *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { @@ -164,7 +160,7 @@ class FeatureHistogram { #define LAMBDA_ARGUMENTS \ double sum_gradient, double sum_hessian, data_size_t num_data, \ const ConstraintEntry &constraints, double parent_output, SplitInfo *output -#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, output, &rand_threshold +#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, num_data, output, &rand_threshold #define FUNC_ARGUMENTS \ sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \ output, rand_threshold, parent_output @@ -780,17 +776,12 @@ class FeatureHistogram { double ret; if (USE_L1) { ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); - if (USE_MAX_OUTPUT) { - if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - ret = Common::Sign(ret) * max_delta_step; - } - } } else { ret = -sum_gradients / (sum_hessians + l2); - if (USE_MAX_OUTPUT) { - if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - ret = Common::Sign(ret) * max_delta_step; - } + } + if (USE_MAX_OUTPUT) { + if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { + ret = Common::Sign(ret) * max_delta_step; } } if (USE_SMOOTHING) { From ab2ca816b22fa2a67c36811994aa378464d9fbd6 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 9 Apr 2020 17:20:46 +1000 Subject: [PATCH 05/10] Minor refactor. 
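This refactor computes the no-split gain in BeforeNumercal through a single GetLeafGain call (so the smoothed output is used consistently), and replaces the runtime use_smoothing branches in GatherInfoForThresholdNumerical/Categorical with templated Inner variants.

For reference, here is a minimal Python sketch (not part of the patch) of the smoothing rule that CalculateSplittedLeafOutput applies after this series. It ignores lambda_l1, max_delta_step and monotone constraints, and the function and argument names are illustrative only, not the C++ identifiers:

    def smoothed_leaf_output(sum_gradients, sum_hessians, lambda_l2, path_smooth,
                             num_data, parent_output):
        # unsmoothed optimal leaf output (lambda_l1 / max_delta_step omitted for brevity)
        w = -sum_gradients / (sum_hessians + lambda_l2)
        if path_smooth <= 0:
            return w
        # smoothing step from CalculateSplittedLeafOutput:
        #   w * (n / s) / (n / s + 1) + parent_output / (n / s + 1)
        k = num_data / path_smooth
        return (k * w + parent_output) / (k + 1)

    # e.g. a 5-sample leaf with path_smooth = 10: the unsmoothed output 2.0 is pulled
    # to ~0.67, i.e. one third of the way from the parent output (0.0) towards 2.0
    print(smoothed_leaf_output(-10.0, 5.0, 0.0, 10.0, 5, 0.0))
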
--- src/treelearner/feature_histogram.hpp | 177 +++++++++++--------------- 1 file changed, 73 insertions(+), 104 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index bb2a0f9f44c0..3e99747a73e2 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -99,11 +99,10 @@ class FeatureHistogram { SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; - double output_without_split = CalculateSplittedLeafOutput( + + double gain_shift = GetLeafGain( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); - double gain_shift = GetLeafGainGivenOutput( - sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, output_without_split); + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { @@ -527,6 +526,21 @@ class FeatureHistogram { uint32_t threshold, data_size_t num_data, double parent_output, SplitInfo* output) { bool use_smoothing = meta_->config->path_smooth > kEpsilon; + if (use_smoothing) { + GatherInfoForThresholdNumericalInner(sum_gradient, sum_hessian, + threshold, num_data, + parent_output, output); + } else { + GatherInfoForThresholdNumericalInner(sum_gradient, sum_hessian, + threshold, num_data, + parent_output, output); + } + } + + template + void GatherInfoForThresholdNumericalInner(double sum_gradient, double sum_hessian, + uint32_t threshold, data_size_t num_data, + double parent_output, SplitInfo* output) { double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); @@ -573,28 +587,15 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_hessian = sum_hessian - sum_right_hessian; data_size_t left_count = num_data - right_count; - double current_gain; - if (use_smoothing) { - current_gain = - GetLeafGain( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output) + - GetLeafGain( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, parent_output); - } else { - current_gain = - GetLeafGain( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output) + - GetLeafGain( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, parent_output); - } + double current_gain = + GetLeafGain( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output) + + GetLeafGain( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, parent_output); // gain with split is worse than without split if (std::isnan(current_gain) || current_gain <= min_gain_shift) { @@ -606,33 +607,18 @@ class FeatureHistogram { // 
update split information output->threshold = threshold; - if (use_smoothing) { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output); - } else { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output); - } + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - if (use_smoothing) { - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, - right_count, parent_output); - } else { - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, - right_count, parent_output); - } + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, + right_count, parent_output); output->right_count = num_data - left_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; @@ -640,10 +626,24 @@ class FeatureHistogram { output->default_left = true; } - void GatherInfoForThresholdCategorical(double sum_gradient, - double sum_hessian, uint32_t threshold, - data_size_t num_data, double parent_output, - SplitInfo* output) { + void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, + uint32_t threshold, data_size_t num_data, + double parent_output, SplitInfo* output) { + bool use_smoothing = meta_->config->path_smooth > kEpsilon; + if (use_smoothing) { + GatherInfoForThresholdCategoricalInner(sum_gradient, sum_hessian, threshold, + num_data, parent_output, output); + } else { + GatherInfoForThresholdCategoricalInner(sum_gradient, sum_hessian, threshold, + num_data, parent_output, output); + } + } + + template + void GatherInfoForThresholdCategoricalInner(double sum_gradient, + double sum_hessian, uint32_t threshold, + data_size_t num_data, double parent_output, + SplitInfo* output) { // get SplitInfo for a given one-hot categorical split. 
output->default_left = false; double gain_shift = GetLeafGainGivenOutput( @@ -670,65 +670,34 @@ class FeatureHistogram { double sum_left_gradient = grad; double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain - double current_gain; - bool use_smoothing = meta_->config->path_smooth > kEpsilon; - if (use_smoothing) { - current_gain = - GetLeafGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, - parent_output) + - GetLeafGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, - parent_output); - } else { - current_gain = - GetLeafGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, - parent_output) + - GetLeafGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, - parent_output); - } + double current_gain = + GetLeafGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, + parent_output) + + GetLeafGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, + parent_output); if (std::isnan(current_gain) || current_gain <= min_gain_shift) { output->gain = kMinScore; Log::Warning( "'Forced Split' will be ignored since the gain getting worse."); return; } - - if (use_smoothing) { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, left_count, - parent_output); - } else { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, left_count, - parent_output); - } + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, left_count, + parent_output); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - if (use_smoothing) { - output->right_output = CalculateSplittedLeafOutput( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, right_count, - parent_output); - } else { - output->right_output = CalculateSplittedLeafOutput( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, right_count, - parent_output); - } + output->right_output = CalculateSplittedLeafOutput( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, right_count, + parent_output); output->right_count = right_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_right_hessian - kEpsilon; From e4a84dad25c1a7fcc6139483c535277b41c5b256 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 9 Apr 2020 17:48:20 +1000 Subject: [PATCH 06/10] Restore old code to get CI working. 
--- .ci/test_r_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 31fae4ae06bd..4fd21a028ca1 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -91,7 +91,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then exit -1 fi -ALLOWED_CHECK_NOTES=2 +ALLOWED_CHECK_NOTES=4 NUM_CHECK_NOTES=$( cat ${LOG_FILE_NAME} \ | grep -e '^Status: .* NOTE.*' \ From 1a86a6b6b6c3af16aaedaf44d01e13ef88593a98 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sun, 26 Apr 2020 15:11:44 +1000 Subject: [PATCH 07/10] Fix style issues. --- src/treelearner/leaf_splits.hpp | 4 ++-- src/treelearner/serial_tree_learner.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 1f396198e580..b0a753eafe1b 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -137,8 +137,8 @@ class LeafSplits { /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } - /*! \brief Get weight of current leaf */ - double weight() const { return weight_; } + /*! \brief Get weight of current leaf */ + double weight() const { return weight_; } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 491e80f00505..a68a65ee91b4 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -708,9 +708,9 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } else { parent_output = leaf_splits->weight(); } - histogram_array_[feature_index].FindBestThreshold( - leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, - constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); + histogram_array_[feature_index].FindBestThreshold( + leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, + constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= From a77b509ab0a58d8be60fe046aa64810b203b41e1 Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 27 Apr 2020 21:36:38 +1000 Subject: [PATCH 08/10] Fix ci for R package. --- .ci/test_r_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 1ef61c3910a2..8bab60fac567 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -98,7 +98,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then exit -1 fi -ALLOWED_CHECK_NOTES=4 +ALLOWED_CHECK_NOTES=3 NUM_CHECK_NOTES=$( cat ${LOG_FILE_NAME} \ | grep -e '^Status: .* NOTE.*' \ From 239d90ff145498c68e9fccfc8ffdc0695c6ba1ba Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 27 Apr 2020 21:59:30 +1000 Subject: [PATCH 09/10] Minor fixes for docs and code style. --- docs/Parameters-Tuning.rst | 4 ++-- docs/Parameters.rst | 6 +++--- include/LightGBM/config.h | 6 +++--- src/treelearner/feature_histogram.hpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index 1f39b29ded46..bb29cb7b7412 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -81,6 +81,6 @@ Deal with Over-fitting - Try ``extra_trees`` -.. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 - - Try increasing ``path_smoothing`` + +.. 
_Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 diff --git a/docs/Parameters.rst b/docs/Parameters.rst index b87b8d0b9f4e..8e3c934beb1b 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -530,13 +530,13 @@ Learning Control Parameters - if set to zero, no smoothing is applied - - if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + - if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. - larger values give stronger regularisation - - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node + - the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node - - (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) + - note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth - ``verbosity`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``verbose`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 69911de006a1..1306beedd839 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -499,10 +499,10 @@ struct Config { // desc = controls smoothing applied to tree nodes // desc = helps prevent overfitting on leaves with few samples // desc = if set to zero, no smoothing is applied - // desc = if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. 
// desc = larger values give stronger regularisation - // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node - // descl2 = (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) + // descl2 = the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node + // descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth double path_smooth = 0; // alias = verbose diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 3e99747a73e2..584d37257ab5 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -102,7 +102,7 @@ class FeatureHistogram { double gain_shift = GetLeafGain( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { From eeb4b75e81ec32bc446c640b6a8fb507608ca0c2 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 29 Apr 2020 09:32:30 +1000 Subject: [PATCH 10/10] Update docs. --- docs/Parameters-Tuning.rst | 2 +- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index bb29cb7b7412..1d16e823220d 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -81,6 +81,6 @@ Deal with Over-fitting - Try ``extra_trees`` -- Try increasing ``path_smoothing`` +- Try increasing ``path_smooth`` .. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 8e3c934beb1b..64251f8d4573 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -530,7 +530,7 @@ Learning Control Parameters - if set to zero, no smoothing is applied - - if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. + - if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2`` - larger values give stronger regularisation diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 1306beedd839..5cdc6139dc0e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -499,7 +499,7 @@ struct Config { // desc = controls smoothing applied to tree nodes // desc = helps prevent overfitting on leaves with few samples // desc = if set to zero, no smoothing is applied - // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. 
+ // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2`` // desc = larger values give stronger regularisation // descl2 = the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node // descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
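---

Note on the documented weight formula: reading ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)`` as the weighted average ``(n / path_smooth * w + w_p) / (n / path_smooth + 1)`` (the parenthesisation under which the limits behave as the docs describe), smoothing pulls sparsely populated leaves towards their parent's output while leaving well-populated leaves essentially unchanged. The standalone C++ sketch below illustrates that behaviour under this reading; it is not the patch's implementation, and the function name and numbers are illustrative only.

    #include <iostream>

    // Illustrative sketch of path smoothing (not the code added by this patch).
    // raw_output approximates the optimal unsmoothed leaf value,
    // -sum_gradients / sum_hessians; parent_output is assumed to be the
    // (already smoothed) output of the parent node.
    double SmoothedLeafOutput(double sum_gradients, double sum_hessians,
                              double num_data, double path_smooth,
                              double parent_output) {
      const double raw_output = -sum_gradients / sum_hessians;
      if (path_smooth <= 0.0) {
        return raw_output;  // path_smooth == 0 disables smoothing
      }
      const double n_over_s = num_data / path_smooth;
      // Weighted average: as num_data grows the result approaches raw_output;
      // as num_data approaches 0 it approaches parent_output.
      return (n_over_s * raw_output + parent_output) / (n_over_s + 1.0);
    }

    int main() {
      // Few samples in the leaf: output is pulled strongly towards the parent.
      std::cout << SmoothedLeafOutput(-3.0, 10.0, 5.0, 10.0, 0.05) << "\n";     // ~0.133
      // Many samples: smoothing has almost no effect (raw output is 0.3).
      std::cout << SmoothedLeafOutput(-3.0, 10.0, 5000.0, 10.0, 0.05) << "\n";  // ~0.2995
      return 0;
    }

Under this reading, ``path_smooth`` acts as a pseudo-count: a leaf needs roughly ``path_smooth`` samples before its own statistics outweigh the parent output, which is why larger values give stronger regularisation.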