feat: improve sample efficiency (#28)

superlinear-ai · Jun 11, 2024 · f71376e · f71376e
1 parent 45922e1
commit f71376e
Show file tree

Hide file tree

Showing 4 changed files with 98 additions and 53 deletions.
diff --git a/README.md b/README.md
@@ -56,23 +56,23 @@ conformal_predictor.fit(X_train, y_train)
 
 # Predict quantiles with the conformal predictor
 ŷ_test_quantiles = conformal_predictor.predict_quantiles(
-    X_test, quantiles=(0.025, 0.05, 0.1, 0.9, 0.95, 0.975)
+    X_test, quantiles=(0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975)
 )
 ```
 
 When the input data is a pandas DataFrame, the output is also a pandas DataFrame. For example, printing the head of `ŷ_test_quantiles` yields:
 
-|   house_id |    0.025 |     0.05 |      0.1 |      0.9 |     0.95 |    0.975 |
-|-----------:|---------:|---------:|---------:|---------:|---------:|---------:|
-|       1357 | 114784.0 | 120894.3 | 131618.0 | 175760.5 | 188052.0 | 205448.8 |
-|       2367 |  67416.6 |  80073.7 |  86754.0 | 117854.1 | 127582.6 | 142321.9 |
-|       2822 | 119422.7 | 132047.7 | 138724.6 | 178526.0 | 197246.2 | 214205.6 |
-|       2126 |  94030.6 |  99850.0 | 110891.3 | 150249.2 | 164703.0 | 182528.1 |
-|       1544 |  68996.2 |  81516.3 |  88231.6 | 121774.2 | 132425.1 | 147110.2 |
+|   house_id |    0.025 |     0.05 |      0.1 |      0.5 |      0.9 |     0.95 |    0.975 |
+|-----------:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|
+|       1357 | 114743.7 | 120917.9 | 131752.6 | 156708.2 | 175907.8 | 187996.1 | 205443.4 |
+|       2367 |  67382.7 |  80191.7 |  86871.8 | 105807.1 | 118465.3 | 127581.2 | 142419.1 |
+|       2822 | 119068.0 | 131864.8 | 138541.6 | 159447.7 | 179227.2 | 197337.0 | 214134.1 |
+|       2126 |  93885.8 | 100040.7 | 111345.5 | 134292.7 | 150557.1 | 164595.8 | 182524.1 |
+|       1544 |  68959.8 |  81648.8 |  88364.1 | 108298.3 | 122329.6 | 132421.1 | 147225.6 |
 
 Let's visualize the predicted quantiles on the test set:
 
-<img src="https://github.com/radix-ai/conformal-tights/assets/4543654/594682d2-0431-4fa8-9126-9e1482992d26">
+<img src="https://github.com/radix-ai/conformal-tights/assets/4543654/2726d108-ee84-47d0-83d9-7e911b123f0c">
 
 <details>
 <summary>Expand to see the code that generated the graph above</summary>
@@ -84,7 +84,7 @@ import matplotlib.ticker as ticker
 %config InlineBackend.figure_format = "retina"
 plt.rc("font", family="DejaVu Sans", size=10)
 plt.figure(figsize=(8, 4.5))
-idx = ŷ_test.sample(50, random_state=42).sort_values().index
+idx = ŷ_test_quantiles[0.5].sample(50, random_state=42).sort_values().index
 x = list(range(1, len(idx) + 1))
 x_ticks = [1, *list(range(5, len(idx) + 1, 5))]
 for j in range(3):
@@ -217,15 +217,15 @@ Printing the head of the forecast quantiles time series `forecast.quantiles_df(q
 
 | Timestamp      |   Value_NE5_0.025 |   Value_NE5_0.05 |   Value_NE5_0.1 |   Value_NE5_0.25 |   Value_NE5_0.5 |   Value_NE5_0.75 |   Value_NE5_0.9 |   Value_NE5_0.95 |   Value_NE5_0.975 |
 |:---------------|------------------:|-----------------:|----------------:|-----------------:|----------------:|-----------------:|----------------:|-----------------:|------------------:|
-| 2022‑06‑01 01h |           19197.4 |          19262.5 |         19366.4 |          19612.7 |         19786.7 |          19996.5 |         20185.5 |          20293.3 |           20358.0 |
-| 2022‑06‑01 02h |           18963.2 |          19078.7 |         19263.3 |          19463.6 |         19706.0 |          19951.4 |         20125.2 |          20265.8 |           20353.4 |
-| 2022‑06‑01 03h |           19259.1 |          19372.3 |         19551.2 |          19846.4 |         20145.2 |          20401.1 |         20630.4 |          20814.0 |           20939.6 |
-| 2022‑06‑01 04h |           21537.8 |          21745.9 |         21958.0 |          22266.8 |         22600.7 |          22939.7 |         23356.0 |          23538.7 |           23691.7 |
-| 2022‑06‑01 05h |           24304.0 |          24503.6 |         24717.5 |          25029.4 |         25602.3 |          26266.4 |         26791.6 |          26963.8 |           27359.2 |
+| 2022‑06‑01 01h |           19165.2 |          19268.3 |         19435.7 |          19663.0 |         19861.7 |          20062.2 |         20237.9 |          20337.7 |           20453.2 |
+| 2022‑06‑01 02h |           19004.0 |          19099.0 |         19226.3 |          19453.7 |         19710.7 |          19966.1 |         20170.1 |          20272.8 |           20366.9 |
+| 2022‑06‑01 03h |           19372.6 |          19493.0 |         19679.4 |          20027.6 |         20324.6 |          20546.3 |         20773.2 |          20910.3 |           21014.1 |
+| 2022‑06‑01 04h |           21936.2 |          22105.6 |         22436.0 |          22917.5 |         23308.6 |          23604.8 |         23871.0 |          24121.7 |           24351.5 |
+| 2022‑06‑01 05h |           25040.5 |          25330.5 |         25531.1 |          25910.4 |         26439.4 |          26903.2 |         27287.4 |          27493.9 |           27633.9 |
 
 Let's visualize the forecast and its prediction interval on the test set:
 
-<img src="https://github.com/radix-ai/conformal-tights/assets/4543654/6886384d-979f-46ec-ba06-10f4ef8f8f6f">
+<img src="https://github.com/radix-ai/conformal-tights/assets/4543654/8c3c256f-0732-49c7-94f2-e42213e85e4b">
 
 <details>
 <summary>Expand to see the code that generated the graph above</summary>

diff --git a/notebooks/README.ipynb b/notebooks/README.ipynb
diff --git a/src/conformal_tights/_conformal_coherent_quantile_regressor.py b/src/conformal_tights/_conformal_coherent_quantile_regressor.py
@@ -28,17 +28,19 @@ class ConformalCoherentQuantileRegressor(MetaEstimatorMixin, RegressorMixin, Bas
     Adds conformally calibrated quantile and interval prediction to a given regressor by fitting a
     meta-estimator as follows:
 
-        1. The given data is split into a training set and a conformal calibration set.
-        2. The training set is used to fit the given regressor.
-        3. The training set is also used to fit a nonconformity estimator, which is by default an
+        1. All available data is used to fit the given regressor for point prediction later on.
+        2. The available data is then split into a training set and a conformal calibration set.
+        3. The training set is used to fit a base regressor whose predictions are used as the
+           center of the conformal predictions.
+        4. The training set is also used to fit a nonconformity estimator, which is by default an
            XGBoost vector quantile regressor for the quantiles (1/8, 1/4, 1/2, 3/4, 7/8). These
            quantiles are not necessarily monotonic and may cross each other.
-        4. The conformal calibration set is split into two levels.
-        5. The level 1 conformal calibration set is used to fit a Coherent Linear Quantile
+        5. The conformal calibration set is split into two levels.
+        6. The level 1 conformal calibration set is used to fit a Coherent Linear Quantile
            Regression model of the (relative) residuals given the level 1 nonconformity estimates.
            This model produces conformally calibrated quantiles of the (relative) residuals that are
            coherent in the sense that they increase monotonically.
-        6. The level 2 conformal calibration set is used to fit a per-quantile conformal bias on top
+        7. The level 2 conformal calibration set is used to fit a per-quantile conformal bias on top
            of the level 1 conformal quantile predictions of the (relative) residuals.
 
     Quantile and interval predictions are made by predicting the nonconformity estimates, converting
@@ -119,8 +121,8 @@ def fit(
         sample_weight_train, sample_weight_calib = (
             sample_weights[:2] if sample_weight is not None else (None, None)
         )
-        # Split the conformal calibration set into two levels. If would be less than 128 level 2
-        # examples, use all of them for level 1 instead.
+        # Split the conformal calibration set into two levels. If there would be fewer than 128
+        # level 2 examples, use all of them for level 1 instead.
         X_calib_l1, X_calib_l2, y_calib_l1, y_calib_l2, *sample_weights_calib = train_test_split(
             self.X_calib_,
             self.y_calib_,
@@ -133,26 +135,42 @@ def fit(
         self.sample_weight_calib_l1_, self.sample_weight_calib_l2_ = (
             sample_weights_calib[:2] if sample_weight is not None else (None, None)  # type: ignore[has-type]
         )
-        # Check if the estimator was pre-fitted.
+        # Fit the wrapped estimator for point prediction, unless it was already pre-fitted.
         try:
             check_is_fitted(self.estimator)
         except (NotFittedError, TypeError):
-            # Fit the given estimator on the training data.
+            # Fit the given estimator on all available data.
             self.estimator_ = (
                 clone(self.estimator)
                 if self.estimator != "auto"
                 else XGBRegressor(objective="reg:absoluteerror")
             )
             if isinstance(self.estimator_, XGBRegressor):
                 self.estimator_.set_params(enable_categorical=True, random_state=self.random_state)
-            self.estimator_.fit(X_train, y_train, sample_weight=sample_weight_train)
+            self.estimator_.fit(X, y, sample_weight=sample_weight)
         else:
             # Use the pre-fitted estimator.
             self.estimator_ = self.estimator
+        # Fit a base estimator on the training data (which is a subset of all available data). This
+        # estimator's predictions will be used as the center of the conformally calibrated quantiles
+        # and intervals.
+        self.base_estimator_ = (
+            clone(self.estimator) if self.nonconformity_estimator != "auto" else XGBRegressor()
+        )
+        if isinstance(self.base_estimator_, XGBRegressor):
+            self.base_estimator_.set_params(
+                objective="reg:absoluteerror",
+                enable_categorical=True,
+                random_state=self.random_state,
+            )
+        self.base_estimator_.fit(X_train, y_train, sample_weight=sample_weight_train)
         # Fit a nonconformity estimator on the training data with XGBRegressor's vector quantile
         # regression. We fit a minimal number of quantiles to reduce the computational cost, but
         # also to reduce the risk of overfitting in the coherent quantile regressor that is applied
         # on top of the nonconformity estimates.
+        self.nonconformity_quantiles_: list[float] = sorted(
+            set(self.nonconformity_quantiles) | {0.5}  # type: ignore[arg-type]
+        )
         self.nonconformity_estimator_ = (
             clone(self.nonconformity_estimator)
             if self.nonconformity_estimator != "auto"
@@ -161,18 +179,22 @@ def fit(
         if isinstance(self.nonconformity_estimator_, XGBRegressor):
             self.nonconformity_estimator_.set_params(
                 objective="reg:quantileerror",
-                quantile_alpha=self.nonconformity_quantiles,
+                quantile_alpha=self.nonconformity_quantiles_,
                 enable_categorical=True,
                 random_state=self.random_state,
             )
         self.nonconformity_estimator_.fit(X_train, y_train, sample_weight=sample_weight_train)
         # Predict on the level 1 calibration set.
-        self.ŷ_calib_l1_ = self.estimator_.predict(X_calib_l1)
-        self.ŷ_calib_l1_nonconformity_ = self.nonconformity_estimator_.predict(X_calib_l1)
+        self.ŷ_calib_l1_ = np.asarray(self.base_estimator_.predict(X_calib_l1))
+        self.ŷ_calib_l1_nonconformity_ = np.asarray(
+            self.nonconformity_estimator_.predict(X_calib_l1)
+        )
         self.residuals_calib_l1_ = self.ŷ_calib_l1_ - y_calib_l1
         # Predict on the level 2 calibration set.
-        self.ŷ_calib_l2_ = self.estimator_.predict(X_calib_l2)
-        self.ŷ_calib_l2_nonconformity_ = self.nonconformity_estimator_.predict(X_calib_l2)
+        self.ŷ_calib_l2_ = np.asarray(self.base_estimator_.predict(X_calib_l2))
+        self.ŷ_calib_l2_nonconformity_ = np.asarray(
+            self.nonconformity_estimator_.predict(X_calib_l2)
+        )
         self.residuals_calib_l2_ = self.ŷ_calib_l2_ - y_calib_l2
         # Lazily fit level 1 conformal predictors as coherent linear quantile regression models that
         # predict quantiles of the (relative) residuals given the nonconformity estimates, and
@@ -256,8 +278,8 @@ def predict_quantiles(
         """Predict conformally calibrated quantiles on a given dataset."""
         # Predict the absolute and relative quantiles.
         quantiles = np.asarray(quantiles)
-        ŷ = np.asarray(self.estimator_.predict(X))
-        X_cqr = self.nonconformity_estimator_.predict(X)
+        ŷ = np.asarray(self.base_estimator_.predict(X))
+        X_cqr = np.asarray(self.nonconformity_estimator_.predict(X))
         cqr_abs, bias_abs = self._lazily_fit_conformal_predictor("Δŷ", quantiles)
         cqr_rel, bias_rel = self._lazily_fit_conformal_predictor("Δŷ/ŷ", quantiles)
         if priority == "coverage":  # Only allow quantile expansion when the priority is coverage.

diff --git a/tests/test_conformal_quantile_regressor.py b/tests/test_conformal_quantile_regressor.py
@@ -1,6 +1,7 @@
 """Test the Conformal Coherent Quantile Regressor."""
 
 import numpy as np
+import pytest
 from sklearn.base import BaseEstimator
 from sklearn.utils.estimator_checks import check_estimator
 from xgboost import XGBRegressor
@@ -9,13 +10,25 @@
 from tests.conftest import Dataset
 
 
-def test_conformal_quantile_regressor_coverage(dataset: Dataset, regressor: BaseEstimator) -> None:
+@pytest.mark.parametrize("prefit", [True, False], ids=["prefit=True", "prefit=False"])
+def test_conformal_quantile_regressor_coverage(
+    dataset: Dataset,
+    regressor: BaseEstimator,
+    prefit: bool,  # noqa: FBT001
+) -> None:
     """Test ConformalCoherentQuantileRegressor's coverage."""
     # Unpack the dataset.
     X_train, X_test, y_train, y_test = dataset
     # Train the models.
+    if prefit and isinstance(regressor, BaseEstimator):
+        if isinstance(regressor, XGBRegressor):
+            regressor.set_params(enable_categorical=True)
+        regressor.fit(X_train, y_train)
     model = ConformalCoherentQuantileRegressor(estimator=regressor)
     model.fit(X_train, y_train)
+    # Verify that the prefitted model was used.
+    if prefit and isinstance(regressor, BaseEstimator):
+        np.testing.assert_array_equal(model.predict(X_test), regressor.predict(X_test), strict=True)
     # Verify the coherence of the predicted quantiles.
     ŷ_quantiles = model.predict(X_test, quantiles=np.linspace(0.1, 0.9, 3))
     for j in range(ŷ_quantiles.shape[1] - 1):