From 0050954c85f8df08ce43c6c0c85204328ebdb057 Mon Sep 17 00:00:00 2001 From: btrotta Date: Fri, 27 Mar 2020 14:38:32 +1100 Subject: [PATCH 01/10] Path smoothing --- docs/Parameters-Tuning.rst | 2 + docs/Parameters.rst | 14 + include/LightGBM/config.h | 9 + include/LightGBM/tree.h | 5 +- src/io/config.cpp | 8 + src/io/config_auto.cpp | 5 + src/treelearner/feature_histogram.hpp | 359 ++++++++++++------ src/treelearner/leaf_splits.hpp | 12 +- src/treelearner/serial_tree_learner.cpp | 43 ++- .../voting_parallel_tree_learner.cpp | 12 +- tests/python_package_test/test_engine.py | 17 + 11 files changed, 348 insertions(+), 138 deletions(-) diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index d2579361a1c9..1f39b29ded46 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -82,3 +82,5 @@ Deal with Over-fitting - Try ``extra_trees`` .. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 + +- Try increasing ``path_smoothing`` diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 0d9ade659fef..13896a6e2fdf 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -514,6 +514,20 @@ Learning Control Parameters - applied once per forest +- ``path_smooth`` :raw-html:`🔗︎`, default = ``0``, type = double, constraints: ``path_smooth >= 0.0`` + + - controls smoothing applied to tree nodes + + - helps prevent overfitting on leaves with few samples + + - if set to zero, no smoothing is applied + + - if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + + - larger values give stronger regularisation + + - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + - ``verbosity`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``verbose`` - controls the level of LightGBM's verbosity diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 057f56e99491..8feff729d0a8 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -488,6 +488,15 @@ struct Config { // desc = applied once per forest std::vector cegb_penalty_feature_coupled; + // check = >= 0.0 + // desc = controls smoothing applied to tree nodes + // desc = helps prevent overfitting on leaves with few samples + // desc = if set to zero, no smoothing is applied + // desc = if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + // desc = larger values give stronger regularisation + // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + double path_smooth = 0; + // alias = verbose // desc = controls the level of LightGBM's verbosity // desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 55568e41f544..66ba8fb8eacb 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -142,6 +142,9 @@ class Tree { /*! \brief Get depth of specific leaf*/ inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; } + /*! \brief Get parent of specific leaf*/ + inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; } + /*! 
\brief Get feature of specific split*/ inline int split_feature(int split_idx) const { return split_feature_[split_idx]; } @@ -163,8 +166,6 @@ class Tree { return split_feature_inner_[node_idx]; } - inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; } - inline uint32_t threshold_in_bin(int node_idx) const { return threshold_in_bin_[node_idx]; } diff --git a/src/io/config.cpp b/src/io/config.cpp index 0cf1d3c8bf21..fb831b4ec495 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -317,6 +317,14 @@ void Config::CheckParamConflict() { force_col_wise = true; force_row_wise = false; } + // min_data_in_leaf must be at least 2 if path smoothing is active. This is because when the split is calculated + // the count is calculated using the proportion of hessian in the leaf which is rounded up to nearest int, so it can + // be 1 when there is actually no data in the leaf. In rare cases this can cause a bug because with path smoothing the + // calculated split gain can be positive even with zero gradient and hessian. + if (path_smooth > kEpsilon && min_data_in_leaf < 2) { + min_data_in_leaf = 2; + Log::Warning("min_data_in_leaf has been increased to 2 because this is required when path smoothing is active."); + } if (is_parallel && monotone_constraints_method == std::string("intermediate")) { // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 59cc62a5d375..564db37c3334 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -225,6 +225,7 @@ const std::unordered_set& Config::parameter_set() { "cegb_penalty_split", "cegb_penalty_feature_lazy", "cegb_penalty_feature_coupled", + "path_smooth", "verbosity", "input_model", "output_model", @@ -443,6 +444,9 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); } + GetDouble(params, "path_smooth", &path_smooth); + CHECK_GE(path_smooth, 0.0); + GetInt(params, "verbosity", &verbosity); GetString(params, "input_model", &input_model); @@ -646,6 +650,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n"; str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n"; + str_buf << "[path_smooth: " << path_smooth << "]\n"; str_buf << "[verbosity: " << verbosity << "]\n"; str_buf << "[max_bin: " << max_bin << "]\n"; str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n"; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 9c859f7ec1a2..79bf63c34457 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -85,22 +85,29 @@ class FeatureHistogram { void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, const ConstraintEntry& constraints, + double parent_output, SplitInfo* output) { output->default_left = true; output->gain = kMinScore; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, - constraints, output); + constraints, parent_output, output); output->gain *= meta_->penalty; } - template - double BeforeNumercal(double sum_gradient, double sum_hessian, + template + double 
BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; - double gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double gain_shift; + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + gain_shift = GetLeafGain( + sum_gradient, sum_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, 0, 0, 0); + } *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { @@ -144,21 +151,30 @@ class FeatureHistogram { template void FuncForNumricalL2() { -#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT + if (meta_->config->path_smooth > kEpsilon) { + FuncForNumricalL3(); + } else { + FuncForNumricalL3(); + } + } + + template + void FuncForNumricalL3() { +#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING #define LAMBDA_ARGUMENTS \ double sum_gradient, double sum_hessian, data_size_t num_data, \ - const ConstraintEntry &constraints, SplitInfo *output -#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, output, &rand_threshold + const ConstraintEntry &constraints, double parent_output, SplitInfo *output +#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, output, &rand_threshold #define FUNC_ARGUMENTS \ - sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, output, \ - rand_threshold + sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \ + output, rand_threshold, parent_output if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); FindBestThresholdSequentially( FUNC_ARGUMENTS); @@ -169,7 +185,7 @@ class FeatureHistogram { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); FindBestThresholdSequentially( FUNC_ARGUMENTS); @@ -182,7 +198,7 @@ class FeatureHistogram { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); FindBestThresholdSequentially( FUNC_ARGUMENTS); @@ -191,10 +207,9 @@ class FeatureHistogram { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { int rand_threshold = 0; double min_gain_shift = - BeforeNumercal( + BeforeNumercal( BEFORE_ARGUMENTS); - FindBestThresholdSequentially( + FindBestThresholdSequentially( FUNC_ARGUMENTS); output->default_left = false; }; @@ -224,42 +239,52 @@ class FeatureHistogram { template void FuncForCategoricalL1() { + if (meta_->config->path_smooth > kEpsilon) { + FuncForCategoricalL2(); + } else { + FuncForCategoricalL2(); + } + } + + template + void FuncForCategoricalL2() { #define ARGUMENTS \ std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ - std::placeholders::_4, std::placeholders::_5 + std::placeholders::_4, std::placeholders::_5, std::placeholders::_6 if (meta_->config->lambda_l1 > 0) { if (meta_->config->max_delta_step > 0) { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, true>, + 
USE_RAND, USE_MC, true, true, USE_SMOOTHING>, this, ARGUMENTS); } else { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, false>, + USE_RAND, USE_MC, true, false, USE_SMOOTHING>, this, ARGUMENTS); } } else { if (meta_->config->max_delta_step > 0) { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, true>, + USE_RAND, USE_MC, false, true, USE_SMOOTHING>, this, ARGUMENTS); } else { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, false>, + USE_RAND, USE_MC, false, false, USE_SMOOTHING>, this, ARGUMENTS); } } #undef ARGUMENTS } - template + template void FindBestThresholdCategoricalInner(double sum_gradient, double sum_hessian, data_size_t num_data, const ConstraintEntry& constraints, + double parent_output, SplitInfo* output) { is_splittable_ = false; output->default_left = false; @@ -267,9 +292,17 @@ class FeatureHistogram { data_size_t best_left_count = 0; double best_sum_left_gradient = 0; double best_sum_left_hessian = 0; - double gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double gain_shift; + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated + // with the larger categorical l2, whereas min_split_gain uses the original l2. + gain_shift = GetLeafGain(sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, + num_data, 0); + } double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; bool is_full_categorical = meta_->missing_type == MissingType::None; @@ -317,10 +350,10 @@ class FeatureHistogram { } } // current split gain - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints, 0); + constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output); // gain with split is worse than without split if (current_gain <= min_gain_shift) { continue; @@ -419,10 +452,11 @@ class FeatureHistogram { continue; } } - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints, 0); + meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, + left_count, right_count, parent_output); if (current_gain <= min_gain_shift) { continue; } @@ -440,19 +474,18 @@ class FeatureHistogram { } if (is_splittable_) { - output->left_output = - CalculateSplittedLeafOutput( - best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints); + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints, meta_->config->path_smooth, best_left_count, parent_output); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian 
- kEpsilon; - output->right_output = - CalculateSplittedLeafOutput( - sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, meta_->config->path_smooth, + num_data - best_left_count, parent_output); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = @@ -484,22 +517,23 @@ class FeatureHistogram { void GatherInfoForThreshold(double sum_gradient, double sum_hessian, uint32_t threshold, data_size_t num_data, - SplitInfo* output) { + double parent_output, SplitInfo* output) { if (meta_->bin_type == BinType::NumericalBin) { GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, parent_output, output); } else { GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, parent_output, output); } } void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian, uint32_t threshold, data_size_t num_data, - SplitInfo* output) { - double gain_shift = GetLeafGain( + double parent_output, SplitInfo* output) { + bool use_smoothing = meta_->config->path_smooth > kEpsilon; + double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; // do stuff here @@ -543,13 +577,28 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_hessian = sum_hessian - sum_right_hessian; data_size_t left_count = num_data - right_count; - double current_gain = - GetLeafGain( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step) + - GetLeafGain( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double current_gain; + if (use_smoothing) { + current_gain = + GetLeafGain( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output) + + GetLeafGain( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, parent_output); + } else { + current_gain = + GetLeafGain( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output) + + GetLeafGain( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, parent_output); + } // gain with split is worse than without split if (std::isnan(current_gain) || current_gain <= min_gain_shift) { @@ -561,16 +610,33 @@ class FeatureHistogram { // update split information output->threshold = threshold; - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, 
meta_->config->max_delta_step); + if (use_smoothing) { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output); + } else { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output); + } output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + if (use_smoothing) { + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, + right_count, parent_output); + } else { + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, + right_count, parent_output); + } output->right_count = num_data - left_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; @@ -580,13 +646,12 @@ class FeatureHistogram { void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, uint32_t threshold, - data_size_t num_data, + data_size_t num_data, double parent_output, SplitInfo* output) { // get SplitInfo for a given one-hot categorical split. 
output->default_left = false; - double gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step); + double gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; bool is_full_categorical = meta_->missing_type == MissingType::None; int used_bin = meta_->num_bin - 1 + is_full_categorical; @@ -609,13 +674,33 @@ class FeatureHistogram { double sum_left_gradient = grad; double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain - double current_gain = - GetLeafGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step) + - GetLeafGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + double current_gain; + bool use_smoothing = meta_->config->path_smooth > kEpsilon; + if (use_smoothing) { + current_gain = + GetLeafGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, + parent_output) + + GetLeafGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, + parent_output); + } else { + current_gain = + GetLeafGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, + parent_output) + + GetLeafGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, + parent_output); + } if (std::isnan(current_gain) || current_gain <= min_gain_shift) { output->gain = kMinScore; Log::Warning( @@ -623,15 +708,31 @@ class FeatureHistogram { return; } - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + if (use_smoothing) { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, left_count, + parent_output); + } else { + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, left_count, + parent_output); + } output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + if (use_smoothing) { + output->right_output = CalculateSplittedLeafOutput( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, right_count, + parent_output); + } else { + output->right_output = CalculateSplittedLeafOutput( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, right_count, + parent_output); + } output->right_count = right_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_right_hessian - kEpsilon; @@ -670,35 +771,42 @@ class 
FeatureHistogram { return Common::Sign(s) * reg_s; } - template + template static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, - double l2, double max_delta_step) { + double l2, double max_delta_step, + double smoothing, data_size_t num_data, + double parent_output) { + double ret; if (USE_L1) { - double ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); + ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); if (USE_MAX_OUTPUT) { if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - return Common::Sign(ret) * max_delta_step; + ret = Common::Sign(ret) * max_delta_step; } } - return ret; } else { - double ret = -sum_gradients / (sum_hessians + l2); + ret = -sum_gradients / (sum_hessians + l2); if (USE_MAX_OUTPUT) { if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - return Common::Sign(ret) * max_delta_step; + ret = Common::Sign(ret) * max_delta_step; } } - return ret; } + if (USE_SMOOTHING) { + ret = ret * (num_data / smoothing) / (num_data / smoothing + 1) \ + + parent_output / (num_data / smoothing + 1); + } + return ret; } - template + template static double CalculateSplittedLeafOutput( double sum_gradients, double sum_hessians, double l1, double l2, - double max_delta_step, const ConstraintEntry& constraints) { - double ret = CalculateSplittedLeafOutput( - sum_gradients, sum_hessians, l1, l2, max_delta_step); + double max_delta_step, const ConstraintEntry& constraints, + double smoothing, data_size_t num_data, double parent_output) { + double ret = CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output); if (USE_MC) { if (ret < constraints.min) { ret = constraints.min; @@ -710,30 +818,36 @@ class FeatureHistogram { } private: - template + template static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, double sum_right_gradients, double sum_right_hessians, double l1, double l2, double max_delta_step, const ConstraintEntry& constraints, - int8_t monotone_constraint) { + int8_t monotone_constraint, + double smoothing, + data_size_t left_count, + data_size_t right_count, + double parent_output) { if (!USE_MC) { - return GetLeafGain(sum_left_gradients, - sum_left_hessians, l1, l2, - max_delta_step) + - GetLeafGain(sum_right_gradients, - sum_right_hessians, l1, l2, - max_delta_step); + return GetLeafGain(sum_left_gradients, + sum_left_hessians, l1, l2, + max_delta_step, smoothing, + left_count, parent_output) + + GetLeafGain(sum_right_gradients, + sum_right_hessians, l1, l2, + max_delta_step, smoothing, + right_count, parent_output); } else { double left_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, - constraints); + constraints, smoothing, left_count, parent_output); double right_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, - constraints); + constraints, smoothing, right_count, parent_output); if (((monotone_constraint > 0) && (left_output > right_output)) || ((monotone_constraint < 0) && (left_output < right_output))) { return 0; @@ -745,10 +859,11 @@ class FeatureHistogram { } } - template + template static double GetLeafGain(double sum_gradients, double sum_hessians, - double l1, double l2, double max_delta_step) { - if (!USE_MAX_OUTPUT) { + double l1, double l2, double max_delta_step, + double smoothing, data_size_t num_data, double 
parent_output) { + if (!USE_MAX_OUTPUT && !USE_SMOOTHING) { if (USE_L1) { const double sg_l1 = ThresholdL1(sum_gradients, l1); return (sg_l1 * sg_l1) / (sum_hessians + l2); @@ -756,10 +871,9 @@ class FeatureHistogram { return (sum_gradients * sum_gradients) / (sum_hessians + l2); } } else { - double output = CalculateSplittedLeafOutput( - sum_gradients, sum_hessians, l1, l2, max_delta_step); - return GetLeafGainGivenOutput(sum_gradients, sum_hessians, l1, l2, - output); + double output = CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output); + return GetLeafGainGivenOutput(sum_gradients, sum_hessians, l1, l2, output); } } @@ -776,13 +890,13 @@ class FeatureHistogram { } } - template void FindBestThresholdSequentially(double sum_gradient, double sum_hessian, data_size_t num_data, const ConstraintEntry& constraints, double min_gain_shift, SplitInfo* output, - int rand_threshold) { + int rand_threshold, double parent_output) { const int8_t offset = meta_->offset; double best_sum_left_gradient = NAN; double best_sum_left_hessian = NAN; @@ -837,11 +951,12 @@ class FeatureHistogram { } } // current split gain - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints, meta_->monotone_type); + constraints, meta_->monotone_type, meta_->config->path_smooth, + left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain <= min_gain_shift) { continue; @@ -921,11 +1036,12 @@ class FeatureHistogram { } } // current split gain - double current_gain = GetSplitGains( + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints, meta_->monotone_type); + constraints, meta_->monotone_type, meta_->config->path_smooth, left_count, + right_count, parent_output); // gain with split is worse than without split if (current_gain <= min_gain_shift) { continue; @@ -948,19 +1064,21 @@ class FeatureHistogram { // update split information output->threshold = best_threshold; output->left_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, constraints); + meta_->config->max_delta_step, constraints, meta_->config->path_smooth, + best_left_count, parent_output); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->right_output = - CalculateSplittedLeafOutput( + CalculateSplittedLeafOutput( sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints); + constraints, meta_->config->path_smooth, num_data - best_left_count, + parent_output); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = @@ -976,7 +1094,7 @@ class FeatureHistogram { bool is_splittable_ = true; std::function + double, SplitInfo*)> find_best_threshold_fun_; }; @@ -1133,7 +1251,8 @@ class HistogramPool { if (old_config->lambda_l1 != config->lambda_l1 || 
old_config->monotone_constraints != config->monotone_constraints || old_config->extra_trees != config->extra_trees || - old_config->max_delta_step != config->max_delta_step) { + old_config->max_delta_step != config->max_delta_step || + old_config->path_smooth != config->path_smooth) { #pragma omp parallel for schedule(static) for (int i = 0; i < cache_size_; ++i) { for (int j = 0; j < train_data->num_features(); ++j) { diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 6d18c3b6f174..1f396198e580 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -21,7 +21,7 @@ class LeafSplits { public: explicit LeafSplits(data_size_t num_data) :num_data_in_leaf_(num_data), num_data_(num_data), - data_indices_(nullptr) { + data_indices_(nullptr), weight_(0) { } void ResetNumData(data_size_t num_data) { num_data_ = num_data; @@ -37,11 +37,13 @@ class LeafSplits { * \param sum_gradients * \param sum_hessians */ - void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { + void Init(int leaf, const DataPartition* data_partition, double sum_gradients, + double sum_hessians, double weight) { leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; + weight_ = weight; } /*! @@ -135,6 +137,10 @@ class LeafSplits { /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } + /*! \brief Get weight of current leaf */ + double weight() const { return weight_; } + + private: /*! \brief current leaf index */ @@ -149,6 +155,8 @@ class LeafSplits { double sum_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; + /*! 
\brief weight of current leaf */ + double weight_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b7569d22c8e2..09cbb16921e7 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -212,9 +212,16 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* sum_grad += gradients[idx]; sum_hess += hessians[idx]; } - double output = FeatureHistogram::CalculateSplittedLeafOutput( - sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, - config_->max_delta_step); + double output; + if ((config_->path_smooth > kEpsilon) & (i > 0)) { + output = FeatureHistogram::CalculateSplittedLeafOutput( + sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, + config_->max_delta_step, config_->path_smooth, cnt_leaf_data, tree->leaf_parent(i)); + } else { + output = FeatureHistogram::CalculateSplittedLeafOutput( + sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, + config_->max_delta_step, config_->path_smooth, cnt_leaf_data, 0); + } auto old_leaf_output = tree->LeafOutput(i); auto new_leaf_output = output * tree->shrinkage(); tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output); @@ -447,6 +454,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, left_leaf_splits->sum_hessians(), left_threshold, left_leaf_splits->num_data_in_leaf(), + left_leaf_splits->weight(), &left_split); left_split.feature = left_feature; forceSplitMap[*left_leaf] = left_split; @@ -468,6 +476,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, right_leaf_splits->sum_hessians(), right_threshold, right_leaf_splits->num_data_in_leaf(), + right_leaf_splits->weight(), &right_split); right_split.feature = right_feature; forceSplitMap[*right_leaf] = right_split; @@ -611,18 +620,22 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, CHECK_GT(best_split_info.left_count, 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); } else { CHECK_GT(best_split_info.right_count, 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); } auto leaves_need_update = constraints_->Update( tree, is_numerical_split, *left_leaf, *right_leaf, @@ -683,9 +696,19 @@ void SerialTreeLearner::ComputeBestSplitForFeature( return; } SplitInfo new_split; - histogram_array_[feature_index].FindBestThreshold( - leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, - constraints_->Get(leaf_splits->leaf_index()), &new_split); + double parent_output; + if (leaf_splits->leaf_index() == 0) { + // for root leaf the "parent" output is its own output because we don't apply any smoothing to the root + parent_output = FeatureHistogram::CalculateSplittedLeafOutput( + 
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), config_->lambda_l1, + config_->lambda_l2, config_->max_delta_step, constraints_->Get(leaf_splits->leaf_index()), + config_->path_smooth, static_cast(num_data), 0); + } else { + parent_output = leaf_splits->weight(); + } + histogram_array_[feature_index].FindBestThreshold( + leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, + constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index e7a8e55054f7..defd4c206377 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -436,17 +436,21 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, if (best_split_info.left_count < best_split_info.right_count) { smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); } else { smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_hessian, + best_split_info.right_output); larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_hessian, + best_split_info.left_output); } } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 51be083a9f01..efbb65129287 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -2034,6 +2034,23 @@ def test_extra_trees(self): err_new = mean_squared_error(y, predicted_new) self.assertLess(err, err_new) + def test_path_smoothing(self): + # check path smoothing increases regularization + X, y = load_boston(True) + lgb_x = lgb.Dataset(X, label=y) + params = {'objective': 'regression', + 'num_leaves': 32, + 'verbose': -1, + 'seed': 0} + est = lgb.train(params, lgb_x, num_boost_round=10) + predicted = est.predict(X) + err = mean_squared_error(y, predicted) + params['path_smooth'] = 1 + est = lgb.train(params, lgb_x, num_boost_round=10) + predicted_new = est.predict(X) + err_new = mean_squared_error(y, predicted_new) + self.assertLess(err, err_new) + @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') def test_trees_to_dataframe(self): From 198f87cfa47d5ec9df8305036ac4f216420b081b Mon Sep 17 00:00:00 2001 From: btrotta Date: Fri, 27 Mar 2020 16:05:57 +1100 Subject: [PATCH 02/10] Try to fix issue with gpu version. --- src/treelearner/gpu_tree_learner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index ba0b43c58660..f8f57e4b4236 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -1090,8 +1090,8 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right Log::Fatal("Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); } } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output); if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); From 45def6c7c679bdf54ee407bedb6f05d9184e1b5b Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 30 Mar 2020 11:55:51 +1100 Subject: [PATCH 03/10] Fix failing CI for R package. --- .ci/test_r_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 85c277e42a57..d15ceeed8f95 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -91,7 +91,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then exit -1 fi -ALLOWED_CHECK_NOTES=3 +ALLOWED_CHECK_NOTES=4 NUM_CHECK_NOTES=$( cat ${LOG_FILE_NAME} \ | grep -e '^Status: .* NOTE.*' \ From 3a111824149608471c347f975f336c0091ade828 Mon Sep 17 00:00:00 2001 From: btrotta Date: Tue, 7 Apr 2020 11:46:45 +1000 Subject: [PATCH 04/10] Minor fixes. --- docs/Parameters.rst | 4 +++- include/LightGBM/config.h | 3 ++- src/treelearner/feature_histogram.hpp | 31 ++++++++++----------------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 13896a6e2fdf..0652fad0e1b3 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -526,7 +526,9 @@ Learning Control Parameters - larger values give stronger regularisation - - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node + + - (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) - ``verbosity`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``verbose`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 8feff729d0a8..f08b5171984e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -494,7 +494,8 @@ struct Config { // desc = if set to zero, no smoothing is applied // desc = if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. 
// desc = larger values give stronger regularisation - // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the calculated node weight, and `w_p` is the weight of the parent node + // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node + // descl2 = (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) double path_smooth = 0; // alias = verbose diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 79bf63c34457..bb2a0f9f44c0 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -95,19 +95,15 @@ class FeatureHistogram { } template - double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, + double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data, SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; - double gain_shift; - if (USE_SMOOTHING) { - gain_shift = GetLeafGainGivenOutput( - sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); - } else { - gain_shift = GetLeafGain( - sum_gradient, sum_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, 0, 0, 0); - } + double output_without_split = CalculateSplittedLeafOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); + double gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, output_without_split); *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { @@ -164,7 +160,7 @@ class FeatureHistogram { #define LAMBDA_ARGUMENTS \ double sum_gradient, double sum_hessian, data_size_t num_data, \ const ConstraintEntry &constraints, double parent_output, SplitInfo *output -#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, output, &rand_threshold +#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, num_data, output, &rand_threshold #define FUNC_ARGUMENTS \ sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \ output, rand_threshold, parent_output @@ -780,17 +776,12 @@ class FeatureHistogram { double ret; if (USE_L1) { ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); - if (USE_MAX_OUTPUT) { - if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - ret = Common::Sign(ret) * max_delta_step; - } - } } else { ret = -sum_gradients / (sum_hessians + l2); - if (USE_MAX_OUTPUT) { - if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { - ret = Common::Sign(ret) * max_delta_step; - } + } + if (USE_MAX_OUTPUT) { + if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { + ret = Common::Sign(ret) * max_delta_step; } } if (USE_SMOOTHING) { From ab2ca816b22fa2a67c36811994aa378464d9fbd6 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 9 Apr 2020 17:20:46 +1000 Subject: [PATCH 05/10] Minor refactor. 
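This refactor computes the no-split gain in BeforeNumercal through a single GetLeafGain call (so the smoothed output is used consistently), and replaces the runtime use_smoothing branches in GatherInfoForThresholdNumerical/Categorical with templated Inner variants.

For reference, here is a minimal Python sketch (not part of the patch) of the smoothing rule that CalculateSplittedLeafOutput applies after this series. It ignores lambda_l1, max_delta_step and monotone constraints, and the function and argument names are illustrative only, not the C++ identifiers:

    def smoothed_leaf_output(sum_gradients, sum_hessians, lambda_l2, path_smooth,
                             num_data, parent_output):
        # unsmoothed optimal leaf output (lambda_l1 / max_delta_step omitted for brevity)
        w = -sum_gradients / (sum_hessians + lambda_l2)
        if path_smooth <= 0:
            return w
        # smoothing step from CalculateSplittedLeafOutput:
        #   w * (n / s) / (n / s + 1) + parent_output / (n / s + 1)
        k = num_data / path_smooth
        return (k * w + parent_output) / (k + 1)

    # e.g. a 5-sample leaf with path_smooth = 10: the unsmoothed output 2.0 is pulled
    # to ~0.67, i.e. one third of the way from the parent output (0.0) towards 2.0
    print(smoothed_leaf_output(-10.0, 5.0, 0.0, 10.0, 5, 0.0))
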
--- src/treelearner/feature_histogram.hpp | 177 +++++++++++--------------- 1 file changed, 73 insertions(+), 104 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index bb2a0f9f44c0..3e99747a73e2 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -99,11 +99,10 @@ class FeatureHistogram { SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; - double output_without_split = CalculateSplittedLeafOutput( + + double gain_shift = GetLeafGain( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); - double gain_shift = GetLeafGainGivenOutput( - sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, output_without_split); + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { @@ -527,6 +526,21 @@ class FeatureHistogram { uint32_t threshold, data_size_t num_data, double parent_output, SplitInfo* output) { bool use_smoothing = meta_->config->path_smooth > kEpsilon; + if (use_smoothing) { + GatherInfoForThresholdNumericalInner(sum_gradient, sum_hessian, + threshold, num_data, + parent_output, output); + } else { + GatherInfoForThresholdNumericalInner(sum_gradient, sum_hessian, + threshold, num_data, + parent_output, output); + } + } + + template + void GatherInfoForThresholdNumericalInner(double sum_gradient, double sum_hessian, + uint32_t threshold, data_size_t num_data, + double parent_output, SplitInfo* output) { double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); @@ -573,28 +587,15 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_hessian = sum_hessian - sum_right_hessian; data_size_t left_count = num_data - right_count; - double current_gain; - if (use_smoothing) { - current_gain = - GetLeafGain( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output) + - GetLeafGain( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, parent_output); - } else { - current_gain = - GetLeafGain( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output) + - GetLeafGain( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, parent_output); - } + double current_gain = + GetLeafGain( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output) + + GetLeafGain( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, parent_output); // gain with split is worse than without split if (std::isnan(current_gain) || current_gain <= min_gain_shift) { @@ -606,33 +607,18 @@ class FeatureHistogram { // 
update split information output->threshold = threshold; - if (use_smoothing) { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output); - } else { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, - meta_->config->lambda_l2, meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, parent_output); - } + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, parent_output); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - if (use_smoothing) { - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, - right_count, parent_output); - } else { - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, - right_count, parent_output); - } + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, + right_count, parent_output); output->right_count = num_data - left_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; @@ -640,10 +626,24 @@ class FeatureHistogram { output->default_left = true; } - void GatherInfoForThresholdCategorical(double sum_gradient, - double sum_hessian, uint32_t threshold, - data_size_t num_data, double parent_output, - SplitInfo* output) { + void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, + uint32_t threshold, data_size_t num_data, + double parent_output, SplitInfo* output) { + bool use_smoothing = meta_->config->path_smooth > kEpsilon; + if (use_smoothing) { + GatherInfoForThresholdCategoricalInner(sum_gradient, sum_hessian, threshold, + num_data, parent_output, output); + } else { + GatherInfoForThresholdCategoricalInner(sum_gradient, sum_hessian, threshold, + num_data, parent_output, output); + } + } + + template + void GatherInfoForThresholdCategoricalInner(double sum_gradient, + double sum_hessian, uint32_t threshold, + data_size_t num_data, double parent_output, + SplitInfo* output) { // get SplitInfo for a given one-hot categorical split. 
output->default_left = false; double gain_shift = GetLeafGainGivenOutput( @@ -670,65 +670,34 @@ class FeatureHistogram { double sum_left_gradient = grad; double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain - double current_gain; - bool use_smoothing = meta_->config->path_smooth > kEpsilon; - if (use_smoothing) { - current_gain = - GetLeafGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, - parent_output) + - GetLeafGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, - parent_output); - } else { - current_gain = - GetLeafGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, right_count, - parent_output) + - GetLeafGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - meta_->config->path_smooth, left_count, - parent_output); - } + double current_gain = + GetLeafGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, right_count, + parent_output) + + GetLeafGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + meta_->config->path_smooth, left_count, + parent_output); if (std::isnan(current_gain) || current_gain <= min_gain_shift) { output->gain = kMinScore; Log::Warning( "'Forced Split' will be ignored since the gain getting worse."); return; } - - if (use_smoothing) { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, left_count, - parent_output); - } else { - output->left_output = CalculateSplittedLeafOutput( - sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, left_count, - parent_output); - } + output->left_output = CalculateSplittedLeafOutput( + sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, left_count, + parent_output); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; - if (use_smoothing) { - output->right_output = CalculateSplittedLeafOutput( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, right_count, - parent_output); - } else { - output->right_output = CalculateSplittedLeafOutput( - sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, meta_->config->path_smooth, right_count, - parent_output); - } + output->right_output = CalculateSplittedLeafOutput( + sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, meta_->config->path_smooth, right_count, + parent_output); output->right_count = right_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_right_hessian - kEpsilon; From e4a84dad25c1a7fcc6139483c535277b41c5b256 Mon Sep 17 00:00:00 2001 From: btrotta Date: Thu, 9 Apr 2020 17:48:20 +1000 Subject: [PATCH 06/10] Restore old code to get CI working. 
--- .ci/test_r_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 31fae4ae06bd..4fd21a028ca1 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -91,7 +91,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then exit -1 fi -ALLOWED_CHECK_NOTES=2 +ALLOWED_CHECK_NOTES=4 NUM_CHECK_NOTES=$( cat ${LOG_FILE_NAME} \ | grep -e '^Status: .* NOTE.*' \ From 1a86a6b6b6c3af16aaedaf44d01e13ef88593a98 Mon Sep 17 00:00:00 2001 From: btrotta Date: Sun, 26 Apr 2020 15:11:44 +1000 Subject: [PATCH 07/10] Fix style issues. --- src/treelearner/leaf_splits.hpp | 4 ++-- src/treelearner/serial_tree_learner.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 1f396198e580..b0a753eafe1b 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -137,8 +137,8 @@ class LeafSplits { /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } - /*! \brief Get weight of current leaf */ - double weight() const { return weight_; } + /*! \brief Get weight of current leaf */ + double weight() const { return weight_; } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 491e80f00505..a68a65ee91b4 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -708,9 +708,9 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } else { parent_output = leaf_splits->weight(); } - histogram_array_[feature_index].FindBestThreshold( - leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, - constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); + histogram_array_[feature_index].FindBestThreshold( + leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, + constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split); new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= From a77b509ab0a58d8be60fe046aa64810b203b41e1 Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 27 Apr 2020 21:36:38 +1000 Subject: [PATCH 08/10] Fix ci for R package. --- .ci/test_r_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 1ef61c3910a2..8bab60fac567 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -98,7 +98,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then exit -1 fi -ALLOWED_CHECK_NOTES=4 +ALLOWED_CHECK_NOTES=3 NUM_CHECK_NOTES=$( cat ${LOG_FILE_NAME} \ | grep -e '^Status: .* NOTE.*' \ From 239d90ff145498c68e9fccfc8ffdc0695c6ba1ba Mon Sep 17 00:00:00 2001 From: btrotta Date: Mon, 27 Apr 2020 21:59:30 +1000 Subject: [PATCH 09/10] Minor fixes for docs and code style. --- docs/Parameters-Tuning.rst | 4 ++-- docs/Parameters.rst | 6 +++--- include/LightGBM/config.h | 6 +++--- src/treelearner/feature_histogram.hpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index 1f39b29ded46..bb29cb7b7412 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -81,6 +81,6 @@ Deal with Over-fitting - Try ``extra_trees`` -.. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 - - Try increasing ``path_smoothing`` + +.. 
_Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 diff --git a/docs/Parameters.rst b/docs/Parameters.rst index b87b8d0b9f4e..8e3c934beb1b 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -530,13 +530,13 @@ Learning Control Parameters - if set to zero, no smoothing is applied - - if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + - if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. - larger values give stronger regularisation - - the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node + - the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node - - (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) + - note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth - ``verbosity`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``verbose`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 69911de006a1..1306beedd839 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -499,10 +499,10 @@ struct Config { // desc = controls smoothing applied to tree nodes // desc = helps prevent overfitting on leaves with few samples // desc = if set to zero, no smoothing is applied - // desc = if `path_smooth > 0` then `min_data_in_leaf` must be at least `2`. + // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. 
// desc = larger values give stronger regularisation - // descl2 = the weight of each node is `(n / path_smooth) * w + w_p / (n / path_smooth + 1)`, where `n` is the number of samples in the node, `w` is the optimal node weight to minimise the loss (approximately `-sum_gradients / sum_hessians`), and `w_p` is the weight of the parent node - // descl2 = (note that the parent output `w_p` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth) + // descl2 = the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node + // descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth double path_smooth = 0; // alias = verbose diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 3e99747a73e2..584d37257ab5 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -102,7 +102,7 @@ class FeatureHistogram { double gain_shift = GetLeafGain( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); *rand_threshold = 0; if (USE_RAND) { if (meta_->num_bin - 2 > 0) { From eeb4b75e81ec32bc446c640b6a8fb507608ca0c2 Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 29 Apr 2020 09:32:30 +1000 Subject: [PATCH 10/10] Update docs. --- docs/Parameters-Tuning.rst | 2 +- docs/Parameters.rst | 2 +- include/LightGBM/config.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index bb29cb7b7412..1d16e823220d 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -81,6 +81,6 @@ Deal with Over-fitting - Try ``extra_trees`` -- Try increasing ``path_smoothing`` +- Try increasing ``path_smooth`` .. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 8e3c934beb1b..64251f8d4573 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -530,7 +530,7 @@ Learning Control Parameters - if set to zero, no smoothing is applied - - if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. + - if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2`` - larger values give stronger regularisation diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 1306beedd839..5cdc6139dc0e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -499,7 +499,7 @@ struct Config { // desc = controls smoothing applied to tree nodes // desc = helps prevent overfitting on leaves with few samples // desc = if set to zero, no smoothing is applied - // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``. 
+ // desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2`` // desc = larger values give stronger regularisation // descl2 = the weight of each node is ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node // descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
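---

Note on the documented weight formula: reading ``(n / path_smooth) * w + w_p / (n / path_smooth + 1)`` as the weighted average ``(n / path_smooth * w + w_p) / (n / path_smooth + 1)`` (the parenthesisation under which the limits behave as the docs describe), smoothing pulls sparsely populated leaves towards their parent's output while leaving well-populated leaves essentially unchanged. The standalone C++ sketch below illustrates that behaviour under this reading; it is not the patch's implementation, and the function name and numbers are illustrative only.

    #include <iostream>

    // Illustrative sketch of path smoothing (not the code added by this patch).
    // raw_output approximates the optimal unsmoothed leaf value,
    // -sum_gradients / sum_hessians; parent_output is assumed to be the
    // (already smoothed) output of the parent node.
    double SmoothedLeafOutput(double sum_gradients, double sum_hessians,
                              double num_data, double path_smooth,
                              double parent_output) {
      const double raw_output = -sum_gradients / sum_hessians;
      if (path_smooth <= 0.0) {
        return raw_output;  // path_smooth == 0 disables smoothing
      }
      const double n_over_s = num_data / path_smooth;
      // Weighted average: as num_data grows the result approaches raw_output;
      // as num_data approaches 0 it approaches parent_output.
      return (n_over_s * raw_output + parent_output) / (n_over_s + 1.0);
    }

    int main() {
      // Few samples in the leaf: output is pulled strongly towards the parent.
      std::cout << SmoothedLeafOutput(-3.0, 10.0, 5.0, 10.0, 0.05) << "\n";     // ~0.133
      // Many samples: smoothing has almost no effect (raw output is 0.3).
      std::cout << SmoothedLeafOutput(-3.0, 10.0, 5000.0, 10.0, 0.05) << "\n";  // ~0.2995
      return 0;
    }

Under this reading, ``path_smooth`` acts as a pseudo-count: a leaf needs roughly ``path_smooth`` samples before its own statistics outweigh the parent output, which is why larger values give stronger regularisation.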