From debf9650c90a3ffbae3fd836bd0e4965e4f3d719 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 14 Sep 2025 18:51:22 +0200 Subject: [PATCH 1/5] minimal changes --- sklearn/tree/_criterion.pyx | 44 +++++++++++++++++++-------------- sklearn/tree/tests/test_tree.py | 8 +++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0e4b3358e5f89..681c11b2c1c82 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1192,8 +1192,9 @@ cdef void precompute_absolute_errors( intp_t k, intp_t start, intp_t end, + float64_t q, float64_t[::1] abs_errors, - float64_t[::1] medians + float64_t[::1] quantiles ) noexcept nogil: """ Fill `abs_errors` and `medians`. @@ -1222,6 +1223,8 @@ cdef void precompute_absolute_errors( Start index in `sample_indices` end : intp_t End index (exclusive) in `sample_indices` + q : float64_t + Quantile abs_errors : float64_t[::1] array to store (increment) the computed absolute errors. Shape: (n,) with n := end - start @@ -1257,8 +1260,8 @@ cdef void precompute_absolute_errors( cdef float64_t y cdef float64_t w = 1.0 cdef float64_t top_val, top_weight - cdef float64_t median = 0.0 - cdef float64_t half_weight + cdef float64_t quantile = 0.0 + cdef float64_t split_weight p = start for _ in range(n): @@ -1275,29 +1278,29 @@ cdef void precompute_absolute_errors( else: below.push(y, w) - half_weight = (above.total_weight + below.total_weight) / 2.0 + split_weight = (above.total_weight + below.total_weight) * q # Rebalance heaps - while above.total_weight < half_weight and not below.is_empty(): + while above.total_weight < split_weight and not below.is_empty(): below.pop(&top_val, &top_weight) above.push(top_val, top_weight) while ( not above.is_empty() - and (above.total_weight - above.top_weight()) >= half_weight + and (above.total_weight - above.top_weight()) >= split_weight ): above.pop(&top_val, &top_weight) below.push(top_val, top_weight) # Current median - if above.total_weight > half_weight + 1e-5 * fabs(half_weight): - median = above.top() - else: # above and below weight are almost exactly equals - median = (above.top() + below.top()) / 2. - medians[j] = median + if above.total_weight > split_weight + 1e-5 * fabs(split_weight): + quantile = above.top() + else: # above and below heaps are almost exactly balanced + quantile = q * above.top() + (1 - q) * below.top() + # FIXME: check if it should be (1 - q) * ... + q * ... + quantiles[j] = quantile abs_errors[j] += ( - (below.total_weight - above.total_weight) * median - - below.weighted_sum - + above.weighted_sum + q * (above.weighted_sum - above.total_weight * quantile) + + (1 - q) * (below.total_weight * quantile - below.weighted_sum) ) p += step j += step @@ -1307,7 +1310,8 @@ def _py_precompute_absolute_errors( const float64_t[:, ::1] ys, const float64_t[:] sample_weight, const intp_t[:] sample_indices, - bint suffix=False + bint suffix=False, + float64_t q=0.5 ): """ Used for testing precompute_absolute_errors. @@ -1334,7 +1338,7 @@ def _py_precompute_absolute_errors( precompute_absolute_errors( ys, sample_weight, sample_indices, above, below, - k, start, end, abs_errors, medians + k, start, end, q, abs_errors, medians ) return np.asarray(abs_errors) @@ -1469,7 +1473,7 @@ cdef class MAE(Criterion): # which are allowed only with n_outputs=1. 
precompute_absolute_errors( self.y, self.sample_weight, self.sample_indices, - self.above, self.below, k, self.start, self.end, + self.above, self.below, k, self.start, self.end, 0.5, # left_abs_errors is incremented, left_medians is overwritten self.left_abs_errors, self.left_medians ) @@ -1477,13 +1481,17 @@ cdef class MAE(Criterion): # i.e., reversed, and abs error & median are filled in reverse order to. precompute_absolute_errors( self.y, self.sample_weight, self.sample_indices, - self.above, self.below, k, self.end - 1, self.start - 1, + self.above, self.below, k, self.end - 1, self.start - 1, 0.5, # right_abs_errors is incremented, right_medians is overwritten self.right_abs_errors, self.right_medians ) # Store the median for the current node self.node_medians[k] = self.right_medians[0] + for p in range(self.start, self.end): + self.left_abs_errors[p] *= 2 + self.right_abs_errors[p] *= 2 + return 0 cdef int reverse_reset(self) except -1 nogil: diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index f86e41a2723ca..f0202552ba87c 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2915,11 +2915,11 @@ def compute_abs_error(y: np.ndarray, w: np.ndarray): y = np.random.uniform(size=(n, 1)) w = np.random.rand(n) indices = np.arange(n) - abs_errors = _py_precompute_absolute_errors(y, w, indices) + abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices) expected = compute_prefix_abs_errors_naive(y, w) assert np.allclose(abs_errors, expected) - abs_errors = _py_precompute_absolute_errors(y, w, indices, suffix=True) + abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices, suffix=True) expected = compute_prefix_abs_errors_naive(y[::-1], w[::-1])[::-1] assert np.allclose(abs_errors, expected) @@ -2929,10 +2929,10 @@ def compute_abs_error(y: np.ndarray, w: np.ndarray): y_sorted = y[indices] w_sorted = w[indices] - abs_errors = _py_precompute_absolute_errors(y, w, indices) + abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices) expected = compute_prefix_abs_errors_naive(y_sorted, w_sorted) assert np.allclose(abs_errors, expected) - abs_errors = _py_precompute_absolute_errors(y, w, indices, suffix=True) + abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices, suffix=True) expected = compute_prefix_abs_errors_naive(y_sorted[::-1], w_sorted[::-1])[::-1] assert np.allclose(abs_errors, expected) From 6e267d5ba5e26c61f71028a411dfd996b7e7a3bd Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 14 Sep 2025 21:24:51 +0200 Subject: [PATCH 2/5] AE to pinball loss --- sklearn/tree/_criterion.pyx | 48 ++++++++++++++++--------------- sklearn/tree/tests/test_tree.py | 51 +++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 681c11b2c1c82..709ee5f2ce362 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1183,7 +1183,7 @@ cdef class MSE(RegressionCriterion): # Helper for MAE criterion: -cdef void precompute_absolute_errors( +cdef void precompute_pinball_losses( const float64_t[:, ::1] ys, const float64_t[:] sample_weight, const intp_t[:] sample_indices, @@ -1193,18 +1193,18 @@ cdef void precompute_absolute_errors( intp_t start, intp_t end, float64_t q, - float64_t[::1] abs_errors, + float64_t[::1] losses, float64_t[::1] quantiles ) noexcept nogil: """ - Fill `abs_errors` and `medians`. + Fill `losses` and `quantiles`. 
If start < end: - Computes the "prefix" AEs/medians, i.e the AEs for each set of indices + Computes the "prefix" losses/quantiles, i.e. the losses for each set of indices sample_indices[start:start + i] with i in {1, ..., n} where n = end - start Else: - Computes the "suffix" AEs/medians, i.e the AEs for each set of indices + Computes the "suffix" losses/quantiles, i.e. the losses for each set of indices sample_indices[i:] with i in {0, ..., n-1} Parameters @@ -1224,25 +1224,26 @@ cdef void precompute_absolute_errors( end : intp_t End index (exclusive) in `sample_indices` q : float64_t - Quantile - abs_errors : float64_t[::1] + Probability for the quantile / alpha for the pinball loss + losses : float64_t[::1] array to store (increment) the computed pinball losses. Shape: (n,) with n := end - start - medians : float64_t[::1] - array to store (overwrite) the computed medians. Shape: (n,) + quantiles : float64_t[::1] + array to store (overwrite) the computed quantiles. Shape: (n,) Complexity: O(n log n) This algorithm is an adaptation of the two heaps solution of the "find median from a data stream" problem See for instance: https://www.geeksforgeeks.org/dsa/median-of-stream-of-integers-running-integers/ - But here, it's the weighted median and we also need to compute the AE, so: + But here, it's the weighted quantile and we also need to compute the pinball loss, so: - instead of balancing the heaps based on their number of elements, rebalance them based on the summed weights of their elements - - rewrite the AE computation by splitting the sum between elements - above and below the median, which allow to express it as a simple + - rewrite the pinball loss computation by splitting the sum between + elements above and below the quantile, which allows expressing it as a simple O(1) computation. - See the maths in the PR desc: + See the maths in the PR description: TODO + Also see the PR for the initial implementation (weighted median and absolute error): https://github.com/scikit-learn/scikit-learn/pull/32100 """ cdef intp_t j, p, i, step, n @@ -1278,7 +1279,7 @@ cdef void precompute_pinball_losses( else: below.push(y, w) - split_weight = (above.total_weight + below.total_weight) * q + split_weight = (above.total_weight + below.total_weight) * (1 - q) # Rebalance heaps while above.total_weight < split_weight and not below.is_empty(): @@ -1298,7 +1299,7 @@ cdef void precompute_pinball_losses( quantile = q * above.top() + (1 - q) * below.top() # FIXME: check if it should be (1 - q) * ... + q * ...
quantiles[j] = quantile - abs_errors[j] += ( + losses[j] += ( q * (above.weighted_sum - above.total_weight * quantile) + (1 - q) * (below.total_weight * quantile - below.weighted_sum) ) @@ -1306,7 +1307,7 @@ cdef void precompute_absolute_errors( j += step -def _py_precompute_absolute_errors( +def _py_precompute_pinball_losses( const float64_t[:, ::1] ys, const float64_t[:] sample_weight, const intp_t[:] sample_indices, @@ -1329,18 +1330,18 @@ def _py_precompute_absolute_errors( intp_t k = 0 intp_t start = 0 intp_t end = n - float64_t[::1] abs_errors = np.zeros(n, dtype=np.float64) - float64_t[::1] medians = np.zeros(n, dtype=np.float64) + float64_t[::1] losses = np.zeros(n, dtype=np.float64) + float64_t[::1] quantiles = np.zeros(n, dtype=np.float64) if suffix: start = n - 1 end = -1 - precompute_absolute_errors( + precompute_pinball_losses( ys, sample_weight, sample_indices, above, below, - k, start, end, q, abs_errors, medians + k, start, end, q, losses, quantiles ) - return np.asarray(abs_errors) + return np.asarray(losses) cdef class MAE(Criterion): @@ -1471,7 +1472,7 @@ cdef class MAE(Criterion): # Note that at each iteration of this loop, we overwrite `self.left_medians` # and `self.right_medians`. They are used to check for monoticity constraints, # which are allowed only with n_outputs=1. - precompute_absolute_errors( + precompute_pinball_losses( self.y, self.sample_weight, self.sample_indices, self.above, self.below, k, self.start, self.end, 0.5, # left_abs_errors is incremented, left_medians is overwritten @@ -1479,7 +1480,7 @@ cdef class MAE(Criterion): ) # For the right child, we consider samples from end-1 to start-1 # i.e., reversed, and abs error & median are filled in reverse order to. - precompute_absolute_errors( + precompute_pinball_losses( self.y, self.sample_weight, self.sample_indices, self.above, self.below, k, self.end - 1, self.start - 1, 0.5, # right_abs_errors is incremented, right_medians is overwritten @@ -1488,6 +1489,7 @@ cdef class MAE(Criterion): # Store the median for the current node self.node_medians[k] = self.right_medians[0] + # pinball loss with q=0.5 is half of the AE for p in range(self.start, self.end): self.left_abs_errors[p] *= 2 self.right_abs_errors[p] *= 2 diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index f0202552ba87c..0e5c19afcde41 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -20,7 +20,12 @@ from sklearn.dummy import DummyRegressor from sklearn.exceptions import NotFittedError from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, mean_poisson_deviance, mean_squared_error +from sklearn.metrics import ( + accuracy_score, + mean_pinball_loss, + mean_poisson_deviance, + mean_squared_error, +) from sklearn.model_selection import cross_val_score, train_test_split from sklearn.pipeline import make_pipeline from sklearn.random_projection import _sparse_random_matrix @@ -36,7 +41,7 @@ DENSE_SPLITTERS, SPARSE_SPLITTERS, ) -from sklearn.tree._criterion import _py_precompute_absolute_errors +from sklearn.tree._criterion import _py_precompute_pinball_losses from sklearn.tree._partitioner import _py_sort from sklearn.tree._tree import ( NODE_DTYPE, @@ -2885,7 +2890,8 @@ def test_sort_log2_build(): assert_array_equal(samples, expected_samples) -def test_absolute_errors_precomputation_function(): +@pytest.mark.parametrize("q", [0.5, 0.2, 0.9]) +def test_absolute_errors_precomputation_function(q): """ Test the main bit of logic of the 
MAE(Criterion) class (used by DecisionTreeRegressor(criterion="absolute_error")). @@ -2896,31 +2902,38 @@ it can be safely removed. """ - def compute_prefix_abs_errors_naive(y: np.ndarray, w: np.ndarray): + def compute_prefix_losses_naive(y: np.ndarray, w: np.ndarray): + """ + Computes the pinball loss for all (y[:i], w[:i]) + Naive: O(n^2 log n) + """ y = y.ravel() - return np.array([compute_abs_error(y[:i], w[:i]) for i in range(1, y.size + 1)]) + return np.array( + [compute_pinball_loss(y[:i], w[:i]) for i in range(1, y.size + 1)] + ) - def compute_abs_error(y: np.ndarray, w: np.ndarray): + def compute_pinball_loss(y: np.ndarray, w: np.ndarray): # 1) compute the weighted median # i.e. once ordered by y, search for i such that: - # sum(w[:i]) <= 1/2 and sum(w[i+1:]) <= 1/2 + # sum(w[:i]) <= q*sum(w) and sum(w[i+1:]) >= q*sum(w) sorter = np.argsort(y) wc = np.cumsum(w[sorter]) - idx = np.searchsorted(wc, wc[-1] / 2) - median = y[sorter[idx]] - # 2) compute the AE - return (np.abs(y - median) * w).sum() + idx = np.searchsorted(wc, wc[-1] * q) + quantile = y[sorter[idx]] + y_pred = np.full(y.size, quantile) + # 2) compute the pinball loss + return mean_pinball_loss(y, y_pred, sample_weight=w, alpha=q) * w.sum() for n in [3, 5, 10, 20, 50, 100]: y = np.random.uniform(size=(n, 1)) w = np.random.rand(n) indices = np.arange(n) - abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices) - expected = compute_prefix_abs_errors_naive(y, w) + abs_errors = _py_precompute_pinball_losses(y, w, indices, q=q) + expected = compute_prefix_losses_naive(y, w) assert np.allclose(abs_errors, expected) - abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices, suffix=True) - expected = compute_prefix_abs_errors_naive(y[::-1], w[::-1])[::-1] + abs_errors = _py_precompute_pinball_losses(y, w, indices, suffix=True, q=q) + expected = compute_prefix_losses_naive(y[::-1], w[::-1])[::-1] assert np.allclose(abs_errors, expected) x = np.random.rand(n) @@ -2929,10 +2942,10 @@ def compute_abs_error(y: np.ndarray, w: np.ndarray): y_sorted = y[indices] w_sorted = w[indices] - abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices) - expected = compute_prefix_abs_errors_naive(y_sorted, w_sorted) + abs_errors = _py_precompute_pinball_losses(y, w, indices, q=q) + expected = compute_prefix_losses_naive(y_sorted, w_sorted) assert np.allclose(abs_errors, expected) - abs_errors = 2 * _py_precompute_absolute_errors(y, w, indices, suffix=True) - expected = compute_prefix_abs_errors_naive(y_sorted[::-1], w_sorted[::-1])[::-1] + abs_errors = _py_precompute_pinball_losses(y, w, indices, suffix=True, q=q) + expected = compute_prefix_losses_naive(y_sorted[::-1], w_sorted[::-1])[::-1] assert np.allclose(abs_errors, expected) From 592e74a85f7b1914da3a6856ccddf3b409501681 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 17 Sep 2025 20:21:15 +0200 Subject: [PATCH 3/5] Pass down pinball_alpha --- sklearn/tree/_classes.py | 25 ++++++++- sklearn/tree/_criterion.pyx | 96 +++++++++++++++++++-------------- sklearn/tree/tests/test_tree.py | 4 +- 3 files changed, 81 insertions(+), 44 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6b0c46190f849..6f7c8b15dd5eb 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -77,6 +77,7 @@ "friedman_mse": _criterion.FriedmanMSE, "absolute_error": _criterion.MAE, "poisson": _criterion.Poisson, + "pinball": _criterion.Pinball, } DENSE_SPLITTERS = {"best": _splitter.BestSplitter,
"random": _splitter.RandomSplitter} @@ -383,7 +384,14 @@ def _fit( self.n_outputs_, self.n_classes_ ) else: - criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + args = (self.n_outputs_, n_samples) + if self.criterion == "pinball": + args = (*args, self.pinball_alpha) + if self.criterion == "absolute_error": + # FIXME: this is coupled with code at a much lower level + # because of some inheritance shenanigans with __cinit__ + args = (*args, 0.5) + criterion = CRITERIA_REG[self.criterion](*args) else: # Make a deepcopy in case the criterion has mutable attributes that # might be shared and modified concurrently during parallel fitting @@ -1338,9 +1346,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, "criterion": [ - StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), + StrOptions( + { + "squared_error", + "friedman_mse", + "absolute_error", + "poisson", + "pinball", + } + ), Hidden(Criterion), ], + "pinball_alpha": [Interval(RealNotInt, 0.0, 1.0, closed="neither")], } def __init__( @@ -1358,6 +1375,7 @@ def __init__( min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None, + pinball_alpha=0.5, ): super().__init__( criterion=criterion, @@ -1373,6 +1391,7 @@ def __init__( ccp_alpha=ccp_alpha, monotonic_cst=monotonic_cst, ) + self.pinball_alpha = pinball_alpha @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): @@ -1971,6 +1990,7 @@ def __init__( max_leaf_nodes=None, ccp_alpha=0.0, monotonic_cst=None, + pinball_alpha=0.5, ): super().__init__( criterion=criterion, @@ -1985,6 +2005,7 @@ def __init__( random_state=random_state, ccp_alpha=ccp_alpha, monotonic_cst=monotonic_cst, + pinball_alpha=pinball_alpha, ) def __sklearn_tags__(self): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 709ee5f2ce362..859d8e83d22cb 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1292,7 +1292,7 @@ cdef void precompute_pinball_losses( above.pop(&top_val, &top_weight) below.push(top_val, top_weight) - # Current median + # Current quantile if above.total_weight > split_weight + 1e-5 * fabs(split_weight): quantile = above.top() else: # above and below heaps are almost exactly balanced @@ -1344,7 +1344,7 @@ def _py_precompute_pinball_losses( return np.asarray(losses) -cdef class MAE(Criterion): +cdef class Pinball(Criterion): r"""Mean absolute error impurity criterion. MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true @@ -1353,15 +1353,16 @@ cdef class MAE(Criterion): It has almost nothing in common with other regression criterions so it doesn't inherit from RegressionCriterion """ - cdef float64_t[::1] node_medians - cdef float64_t[::1] left_abs_errors - cdef float64_t[::1] right_abs_errors - cdef float64_t[::1] left_medians - cdef float64_t[::1] right_medians + cdef float64_t alpha + cdef float64_t[::1] node_quantiles + cdef float64_t[::1] left_pinball_losses + cdef float64_t[::1] right_pinball_losses + cdef float64_t[::1] left_quantiles + cdef float64_t[::1] right_quantiles cdef WeightedHeap above cdef WeightedHeap below - def __cinit__(self, intp_t n_outputs, intp_t n_samples): + def __cinit__(self, intp_t n_outputs, intp_t n_samples, float64_t alpha): """Initialize parameters for this criterion. 
Parameters ---------- n_outputs : intp_t The number of targets to be predicted n_samples : intp_t The total number of samples to fit on + alpha : float64_t + The quantile level of the pinball loss, in (0, 1) """ + self.alpha = alpha + # Default values self.start = 0 self.pos = 0 @@ -1384,14 +1387,14 @@ cdef class MAE(Criterion): self.weighted_n_left = 0.0 self.weighted_n_right = 0.0 - self.node_medians = np.zeros(n_outputs, dtype=np.float64) + self.node_quantiles = np.zeros(n_outputs, dtype=np.float64) # Note: this criterion has an important memory footprint, which is # fine as it's instantiated only once to build an entire tree - self.left_abs_errors = np.empty(n_samples, dtype=np.float64) - self.right_abs_errors = np.empty(n_samples, dtype=np.float64) - self.left_medians = np.empty(n_samples, dtype=np.float64) - self.right_medians = np.empty(n_samples, dtype=np.float64) + self.left_pinball_losses = np.empty(n_samples, dtype=np.float64) + self.right_pinball_losses = np.empty(n_samples, dtype=np.float64) + self.left_quantiles = np.empty(n_samples, dtype=np.float64) + self.right_quantiles = np.empty(n_samples, dtype=np.float64) self.above = WeightedHeap(n_samples, True) # min-heap self.below = WeightedHeap(n_samples, False) # max-heap @@ -1457,42 +1460,37 @@ cdef class MAE(Criterion): self.pos = self.start n_bytes = self.n_node_samples * sizeof(float64_t) - memset(&self.left_abs_errors[0], 0, n_bytes) - memset(&self.right_abs_errors[0], 0, n_bytes) + memset(&self.left_pinball_losses[0], 0, n_bytes) + memset(&self.right_pinball_losses[0], 0, n_bytes) # Precompute absolute errors (summed over each output) - # and medians (used only when n_outputs=1) + # and quantiles (used only when n_outputs=1) # of the right and left child of all possible splits # for the current ordering of `sample_indices` # Precomputation is needed here and can't be done step-by-step in the update method # like for other criteria. Indeed, we don't have efficient ways to update right child - # statistics when removing samples from it. So we compute right child AEs/medians by + # statistics when removing samples from it. So we compute right child losses/quantiles by # traversing from right to left (and hence only adding samples). for k in range(self.n_outputs): - # Note that at each iteration of this loop, we overwrite `self.left_medians` - # and `self.right_medians`. They are used to check for monoticity constraints, + # Note that at each iteration of this loop, we overwrite `self.left_quantiles` + # and `self.right_quantiles`. They are used to check for monotonicity constraints, # which are allowed only with n_outputs=1. precompute_pinball_losses( self.y, self.sample_weight, self.sample_indices, - self.above, self.below, k, self.start, self.end, 0.5, - # left_abs_errors is incremented, left_medians is overwritten - self.left_abs_errors, self.left_medians + self.above, self.below, k, self.start, self.end, self.alpha, + # left_pinball_losses is incremented, left_quantiles is overwritten + self.left_pinball_losses, self.left_quantiles ) # For the right child, we consider samples from end-1 to start-1 - # i.e., reversed, and abs error & median are filled in reverse order to. + # i.e., reversed, and loss & quantile are filled in reverse order too.
precompute_pinball_losses( self.y, self.sample_weight, self.sample_indices, - self.above, self.below, k, self.end - 1, self.start - 1, 0.5, - # right_abs_errors is incremented, right_medians is overwritten - self.right_abs_errors, self.right_medians + self.above, self.below, k, self.end - 1, self.start - 1, self.alpha, + # right_pinball_losses is incremented, right_quantiles is overwritten + self.right_pinball_losses, self.right_quantiles ) - # Store the median for the current node - self.node_medians[k] = self.right_medians[0] - - # pinball loss with q=0.5 is half of the AE - for p in range(self.start, self.end): - self.left_abs_errors[p] *= 2 - self.right_abs_errors[p] *= 2 + # Store the quantile for the current node + self.node_quantiles[k] = self.right_quantiles[0] return 0 @@ -1529,7 +1527,7 @@ cdef class MAE(Criterion): """Computes the node value of sample_indices[start:end] into dest.""" cdef intp_t k for k in range(self.n_outputs): - dest[k] = self.node_medians[k] + dest[k] = self.node_quantiles[k] cdef inline float64_t middle_value(self) noexcept nogil: """Compute the middle value of a split for monotonicity constraints as the simple average @@ -1540,8 +1538,8 @@ cdef class MAE(Criterion): """ cdef intp_t j = self.pos - self.start return ( - self.left_medians[j - 1] - + self.right_medians[j] + self.left_quantiles[j - 1] + + self.right_quantiles[j] ) / 2 cdef inline bint check_monotonicity( @@ -1555,7 +1553,7 @@ cdef class MAE(Criterion): return self._check_monotonicity( monotonic_cst, lower_bound, upper_bound, - self.left_medians[j - 1], self.right_medians[j]) + self.left_quantiles[j - 1], self.right_quantiles[j]) cdef float64_t node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. @@ -1567,7 +1565,7 @@ cdef class MAE(Criterion): Time complexity: O(1) (precomputed in `.reset()`) """ return ( - self.right_abs_errors[0] + self.right_pinball_losses[0] / (self.weighted_n_node_samples * self.n_outputs) ) @@ -1586,13 +1584,13 @@ cdef class MAE(Criterion): # if pos == start, left child is empty, hence impurity is 0 if self.pos > self.start: - impurity_left += self.left_abs_errors[j - 1] + impurity_left += self.left_pinball_losses[j - 1] p_impurity_left[0] = impurity_left / (self.weighted_n_left * self.n_outputs) # if pos == end, right child is empty, hence impurity is 0 if self.pos < self.end: - impurity_right += self.right_abs_errors[j] + impurity_right += self.right_pinball_losses[j] p_impurity_right[0] = impurity_right / (self.weighted_n_right * self.n_outputs) @@ -1608,6 +1606,24 @@ cdef class MAE(Criterion): dest[0] = upper_bound +cdef class MAE(Pinball): + """ + The median is just the quantile with alpha=0.5, + and the absolute error is twice the pinball loss (with alpha=0.5). + """ + + # FIXME/XXX: Trust the instantiator to pass alpha=0.5 to the __cinit__... + + cdef float64_t node_impurity(self) noexcept nogil: + return 2 * Pinball.node_impurity(self) + + cdef void children_impurity(self, float64_t* p_impurity_left, + float64_t* p_impurity_right) noexcept nogil: + Pinball.children_impurity(self, p_impurity_left, p_impurity_right) + p_impurity_left[0] *= 2 + p_impurity_right[0] *= 2 + + cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman.
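Sanity check for the split used in `precompute_pinball_losses`: for any quantile value m at level q, the weighted pinball loss decomposes exactly as q * (above.weighted_sum - m * above.total_weight) + (1 - q) * (m * below.total_weight - below.weighted_sum), where "above" holds the samples >= m and "below" the rest; this is what makes each update O(1) once the heaps are rebalanced. A minimal NumPy sketch of this identity and of the factor-of-2 relation exploited by `MAE(Pinball)` (illustrative only, not part of the diff; the weighted `np.quantile` call assumes NumPy >= 2.0):

    import numpy as np
    from sklearn.metrics import mean_pinball_loss

    rng = np.random.default_rng(0)
    y = rng.uniform(size=50)
    w = rng.uniform(size=50)
    q = 0.2

    # Weighted q-quantile, playing the role of above.top() in the criterion.
    m = np.quantile(y, q, weights=w, method="inverted_cdf")

    # Per-sample definition of the weighted pinball loss at m.
    direct = np.sum(w * np.where(y >= m, q * (y - m), (1 - q) * (m - y)))

    # Split form used by the criterion: "above" holds samples >= m, "below" the rest.
    above = y >= m
    below = ~above
    split = q * ((w[above] * y[above]).sum() - m * w[above].sum()) + (1 - q) * (
        m * w[below].sum() - (w[below] * y[below]).sum()
    )
    assert np.isclose(direct, split)

    # With alpha=0.5 the pinball loss is half the weighted absolute error, which
    # is what the MAE subclass compensates for by doubling both impurities.
    med = np.quantile(y, 0.5, weights=w, method="inverted_cdf")
    pinball_half = mean_pinball_loss(
        y, np.full(y.size, med), sample_weight=w, alpha=0.5
    ) * w.sum()
    assert np.isclose(2 * pinball_half, np.sum(w * np.abs(y - med)))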
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0e5c19afcde41..c8318ffbdfbc0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1827,8 +1827,8 @@ def _pickle_copy(obj): assert n_outputs == n_outputs_ assert_array_equal(n_classes, n_classes_) - for _, typename in CRITERIA_REG.items(): - criteria = typename(n_outputs, n_samples) + for name, typename in CRITERIA_REG.items(): + criteria = DecisionTreeRegressor(criterion=name).criterion result = copy_func(criteria).__reduce__() typename_, (n_outputs_, n_samples_), _ = result assert typename == typename_ From ecd2f15f17f62963d4f216051a22279c40ec46f5 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 19 Sep 2025 12:51:29 +0200 Subject: [PATCH 4/5] small changes --- sklearn/tree/_criterion.pyx | 11 ++++++----- sklearn/tree/tests/test_tree.py | 9 ++------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 6ee1a955b26cf..4db9d3f0cd33b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1289,12 +1289,13 @@ cdef void precompute_pinball_losses( if above.total_weight > split_weight + 1e-5 * fabs(split_weight): quantile = above.top() else: # above and below heaps are almost exactly balanced - quantile = q * above.top() + (1 - q) * below.top() - # FIXME: check if it should be (1 - q) * ... + q * ... + # Any value between below.top() and above.top() is valid here. + # We choose the midpoint for determinism. + quantile = 0.5 * (above.top() + below.top()) quantiles[j] = quantile losses[j] += ( - q * (above.weighted_sum - above.total_weight * quantile) - + (1 - q) * (below.total_weight * quantile - below.weighted_sum) + q * (above.weighted_sum - quantile * above.total_weight) + + (1 - q) * (quantile * below.total_weight - below.weighted_sum) ) p += step j += step @@ -1321,7 +1322,7 @@ def _py_precompute_pinball_losses( ys, sample_weight, sample_indices, above, below, k, start, end, q, losses, quantiles ) - return np.asarray(losses) + return np.asarray(losses), np.asarray(quantiles) cdef class Pinball(Criterion): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 8acd216ae4c08..faf0356a3aac1 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2913,13 +2913,8 @@ def compute_prefix_losses_naive(y: np.ndarray, w: np.ndarray): ) def compute_pinball_loss(y: np.ndarray, w: np.ndarray): - # 1) compute the weighted median - # i.e. 
once ordered by y, search for i such that: - # sum(w[:i]) <= q*sum(w) and sum(w[i+1:]) >= q*sum(w) - sorter = np.argsort(y) - wc = np.cumsum(w[sorter]) - idx = np.searchsorted(wc, wc[-1] * q) - quantile = y[sorter[idx]] + # 1) compute the weighted quantile + quantile = np.quantile(y, q, weights=w, method="inverted_cdf") y_pred = np.full(y.size, quantile) # 2) compute the pinball loss return mean_pinball_loss(y, y_pred, sample_weight=w, alpha=q) * w.sum() From 075243c4f8b70e86002363671e75714a9778c1aa Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 13:16:10 +0200 Subject: [PATCH 5/5] fixes --- sklearn/tree/_classes.py | 2 +- sklearn/tree/_criterion.pyx | 6 +++--- sklearn/tree/tests/test_tree.py | 26 ++++++++++++++------------ 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6f7c8b15dd5eb..d6ee4193a2373 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -389,7 +389,7 @@ def _fit( args = (*args, self.pinball_alpha) if self.criterion == "absolute_error": # FIXME: this is coupled with code at a much lower level - # because of some inheritance shenanigans with __cinit__ + # because of the inheritance behavior of __cinit__ args = (*args, 0.5) criterion = CRITERIA_REG[self.criterion](*args) else: diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 4db9d3f0cd33b..8ff9b3f9a87ce 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1322,7 +1322,7 @@ def _py_precompute_pinball_losses( ys, sample_weight, sample_indices, above, below, k, start, end, q, losses, quantiles ) - return np.asarray(losses), np.asarray(quantiles) + return np.asarray(losses) cdef class Pinball(Criterion): @@ -1575,10 +1575,10 @@ cdef class Pinball(Criterion): p_impurity_right[0] = impurity_right / (self.weighted_n_right * self.n_outputs) - # those 2 methods are copied from the RegressionCriterion abstract class: def __reduce__(self): - return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) + return (type(self), (self.n_outputs, self.n_samples, self.alpha), self.__getstate__()) + # this method is copied from the RegressionCriterion abstract class: cdef inline void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil: """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.""" if dest[0] < lower_bound: diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index faf0356a3aac1..83a2b4eae4893 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1828,12 +1828,14 @@ def _pickle_copy(obj): assert_array_equal(n_classes, n_classes_) for name, typename in CRITERIA_REG.items(): - criteria = DecisionTreeRegressor(criterion=name).criterion + args = (n_outputs, n_samples) + if name == "absolute_error" or name == "pinball": + args = (*args, 0.5) + criteria = typename(*args) result = copy_func(criteria).__reduce__() - typename_, (n_outputs_, n_samples_), _ = result + typename_, args_, _ = result assert typename == typename_ - assert n_outputs == n_outputs_ - assert n_samples == n_samples_ + assert args == args_ @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS) @@ -2925,13 +2927,13 @@ def compute_pinball_loss(y: np.ndarray, w: np.ndarray): y = rng.uniform(size=(n, 1)) w = rng.rand(n) indices = np.arange(n) - abs_errors = _py_precompute_pinball_losses(y, w, indices, 0, n, q=q) + pb_losses = 
_py_precompute_pinball_losses(y, w, indices, 0, n, q=q) expected = compute_prefix_losses_naive(y, w) - assert np.allclose(abs_errors, expected) + assert np.allclose(pb_losses, expected) - abs_errors = _py_precompute_pinball_losses(y, w, indices, n - 1, -1, q=q) + pb_losses = _py_precompute_pinball_losses(y, w, indices, n - 1, -1, q=q) expected = compute_prefix_losses_naive(y[::-1], w[::-1])[::-1] - assert np.allclose(abs_errors, expected) + assert np.allclose(pb_losses, expected) x = rng.rand(n) indices = np.argsort(x) @@ -2939,10 +2941,10 @@ def compute_pinball_loss(y: np.ndarray, w: np.ndarray): y_sorted = y[indices] w_sorted = w[indices] - abs_errors = _py_precompute_pinball_losses(y, w, indices, 0, n, q=q) + pb_losses = _py_precompute_pinball_losses(y, w, indices, 0, n, q=q) expected = compute_prefix_losses_naive(y_sorted, w_sorted) - assert np.allclose(abs_errors, expected) + assert np.allclose(pb_losses, expected) - abs_errors = _py_precompute_pinball_losses(y, w, indices, n - 1, -1, q=q) + pb_losses = _py_precompute_pinball_losses(y, w, indices, n - 1, -1, q=q) expected = compute_prefix_losses_naive(y_sorted[::-1], w_sorted[::-1])[::-1] - assert np.allclose(abs_errors, expected) + assert np.allclose(pb_losses, expected)
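End-to-end, the series exposes the new criterion as `DecisionTreeRegressor(criterion="pinball", pinball_alpha=...)`, keeping `"absolute_error"` as the alpha=0.5 special case. A usage sketch (illustrative only; assumes a scikit-learn build that includes this branch):

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(500, 1))
    # Heteroscedastic noise so the conditional quantiles actually differ.
    y = X.ravel() + (1 + X.ravel()) * rng.exponential(size=500)

    # pinball_alpha defaults to 0.5, i.e. each leaf predicts a weighted median,
    # matching criterion="absolute_error" up to the factor of 2 in the impurity.
    median_tree = DecisionTreeRegressor(criterion="pinball", max_depth=3).fit(X, y)

    # With pinball_alpha=0.9 each leaf predicts (approximately) the weighted
    # 0.9-quantile of the training samples it contains.
    upper_tree = DecisionTreeRegressor(
        criterion="pinball", pinball_alpha=0.9, max_depth=3
    ).fit(X, y)

    # The 0.9-quantile tree should sit above the median tree on average.
    assert upper_tree.predict(X).mean() > median_tree.predict(X).mean()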