test: Adding tests to verify model reproducibility #1362

Merged 4 commits on Oct 12, 2024
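
For reviewers who want to run just the new tests locally, something like the following should work (a minimal sketch, not part of the PR; it assumes the repository root as the working directory and uses a -k keyword filter of my own choosing):

# Hypothetical local run of the tests added in this PR.
import pytest

exit_code = pytest.main(
    [
        "test/automl/test_classification.py",
        "test/automl/test_regression.py",
        "-k", "reproducibility",  # select only the new reproducibility tests
        "-v",
    ]
)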
120 changes: 119 additions & 1 deletion test/automl/test_classification.py
@@ -1,11 +1,15 @@
import unittest
from datetime import datetime
from test.conftest import evaluate_cv_folds_with_underlying_model

import numpy as np
import pandas as pd
import pytest
import scipy.sparse
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (
train_test_split,
)

from flaml import AutoML, tune
from flaml.automl.model import LGBMEstimator
@@ -420,6 +424,120 @@ def test_sparse_matrix_lr(self):
print(automl_experiment.best_estimator)


@pytest.mark.parametrize(
"estimator",
[
# "catboost",
"extra_tree",
"histgb",
"kneighbor",
"lgbm",
# "lrl1",
"lrl2",
"rf",
"xgboost",
"xgb_limitdepth",
],
)
def test_reproducibility_of_classification_models(estimator: str):
"""FLAML finds the best model for a given dataset, which it then provides to users.

However, there are reported issues where FLAML was providing an incorrect model - see here:
https://github.com/microsoft/FLAML/issues/1317
In this test we take the best model which FLAML provided us, and then retrain and test it on the
same folds, to verify that the result is reproducible.
"""
automl = AutoML()
automl_settings = {
"max_iter": 5,
"time_budget": -1,
"task": "classification",
"n_jobs": 1,
"estimator_list": [estimator],
"eval_method": "cv",
"n_splits": 10,
"metric": "f1",
"keep_search_state": True,
"skip_transform": True,
}
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
config = best_model.get_params()
val_loss_flaml = automl.best_result["val_loss"]

# Take the best model, and see if we can reproduce the best result
reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
config=config,
estimator=best_model,
X_train_all=automl._state.X_train_all,
y_train_all=automl._state.y_train_all,
budget=None,
kf=automl._state.kf,
eval_metric="f1",
best_val_loss=None,
cv_score_agg_func=None,
log_training_metric=False,
fit_kwargs=None,
free_mem_ratio=0,
)
assert pytest.approx(val_loss_flaml) == reproduced_val_loss


@pytest.mark.parametrize(
"estimator",
[
# "catboost",
"extra_tree",
"histgb",
"kneighbor",
# "lgbm",
# "lrl1",
"lrl2",
"rf",
"xgboost",
"xgb_limitdepth",
],
)
def test_reproducibility_of_underlying_classification_models(estimator: str):
"""FLAML finds the best model for a given dataset, which it then provides to users.

However, there are reported issues where FLAML was providing an incorrect model - see here:
https://github.com/microsoft/FLAML/issues/1317
FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model.
    Ideally, a FLAMLised model should perform identically to the underlying model when fitted
    to the same data with no budget. This test verifies that this is the case for classification models.
    In this test we take the best model which FLAML provided us, extract the underlying model,
    and then retrain and test it on the same folds, to verify that the result is reproducible.
"""
automl = AutoML()
automl_settings = {
"max_iter": 5,
"time_budget": -1,
"task": "classification",
"n_jobs": 1,
"estimator_list": [estimator],
"eval_method": "cv",
"n_splits": 10,
"metric": "f1",
"keep_search_state": True,
"skip_transform": True,
}
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
val_loss_flaml = automl.best_result["val_loss"]
reproduced_val_loss_underlying_model = np.mean(
evaluate_cv_folds_with_underlying_model(
automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "classification"
)
)

assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model


if __name__ == "__main__":
test = TestClassification()
test.test_preprocess()
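
As an aside on the tests above: automl.model is the FLAMLised estimator wrapper, and its .model attribute (used as best_model.model in the tests) is the underlying scikit-learn/XGBoost/CatBoost estimator. A minimal sketch of that distinction, assuming an automl object fitted as in the tests above:

best_model = automl.model        # FLAMLised wrapper (a flaml estimator object)
underlying = best_model.model    # the wrapped scikit-learn/XGBoost/CatBoost estimator
print(type(best_model).__name__, type(underlying).__name__)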
114 changes: 113 additions & 1 deletion test/automl/test_regression.py
@@ -1,9 +1,12 @@
import unittest
from test.conftest import evaluate_cv_folds_with_underlying_model

import numpy as np
import pytest
import scipy.sparse
from sklearn.datasets import (
fetch_california_housing,
make_regression,
)

from flaml import AutoML
@@ -205,7 +208,6 @@ def test_regression_xgboost(self):


def test_multioutput():
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

@@ -230,5 +232,115 @@ def test_multioutput():
print(model.predict(X_test))


@pytest.mark.parametrize(
"estimator",
[
# "catboost",
"extra_tree",
"histgb",
"kneighbor",
"lgbm",
"rf",
"xgboost",
"xgb_limitdepth",
],
)
def test_reproducibility_of_regression_models(estimator: str):
"""FLAML finds the best model for a given dataset, which it then provides to users.

However, there are reported issues where FLAML was providing an incorrect model - see here:
https://github.com/microsoft/FLAML/issues/1317
In this test we take the best regression model which FLAML provided us, and then retrain and test it on the
same folds, to verify that the result is reproducible.
"""
automl = AutoML()
automl_settings = {
"max_iter": 2,
"time_budget": -1,
"task": "regression",
"n_jobs": 1,
"estimator_list": [estimator],
"eval_method": "cv",
"n_splits": 3,
"metric": "r2",
"keep_search_state": True,
"skip_transform": True,
}
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
config = best_model.get_params()
val_loss_flaml = automl.best_result["val_loss"]

# Take the best model, and see if we can reproduce the best result
reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
config=config,
estimator=best_model,
X_train_all=automl._state.X_train_all,
y_train_all=automl._state.y_train_all,
budget=None,
kf=automl._state.kf,
eval_metric="r2",
best_val_loss=None,
cv_score_agg_func=None,
log_training_metric=False,
fit_kwargs=None,
free_mem_ratio=0,
)
assert pytest.approx(val_loss_flaml) == reproduced_val_loss


@pytest.mark.parametrize(
"estimator",
[
# "catboost",
"extra_tree",
"histgb",
"kneighbor",
# "lgbm",
"rf",
"xgboost",
"xgb_limitdepth",
],
)
def test_reproducibility_of_underlying_regression_models(estimator: str):
"""FLAML finds the best model for a given dataset, which it then provides to users.

However, there are reported issues where FLAML was providing an incorrect model - see here:
https://github.com/microsoft/FLAML/issues/1317
FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model.
    Ideally, a FLAMLised model should perform identically to the underlying model when fitted
    to the same data with no budget. This test verifies that this is the case for regression models.
    In this test we take the best model which FLAML provided us, extract the underlying model,
    and then retrain and test it on the same folds, to verify that the result is reproducible.
"""
automl = AutoML()
automl_settings = {
"max_iter": 5,
"time_budget": -1,
"task": "regression",
"n_jobs": 1,
"estimator_list": [estimator],
"eval_method": "cv",
"n_splits": 10,
"metric": "r2",
"keep_search_state": True,
"skip_transform": True,
}
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
val_loss_flaml = automl.best_result["val_loss"]
reproduced_val_loss_underlying_model = np.mean(
evaluate_cv_folds_with_underlying_model(
automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "regression"
)
)

assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model


if __name__ == "__main__":
unittest.main()
42 changes: 42 additions & 0 deletions test/conftest.py
@@ -0,0 +1,42 @@
from typing import Any, List

import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import f1_score, r2_score


def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]:
"""Mimic the FLAML CV process to calculate the metrics across each fold.

:param X_train_all: X training data
:param y_train_all: y training data
:param kf: The splitter object to use to generate the folds
:param model: The estimator to fit to the data during the CV process
    :param task: Either "classification" or "regression"
    :return: A list containing the 1 - metric value (1 - f1 or 1 - r2) for each fold
"""
rng = np.random.RandomState(2020)
    all_fold_metrics: List[float] = []
for train_index, val_index in kf.split(X_train_all, y_train_all):
X_train_split, y_train_split = X_train_all, y_train_all
        # FLAML shuffles the training indices (seed 2020) before fitting each fold; mimic that here.
        train_index = rng.permutation(train_index)
X_train = X_train_split.iloc[train_index]
X_val = X_train_split.iloc[val_index]
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
model_type = type(model)
if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
model.fit(X_train, y_train)
        else:
            # For CatBoost, mimic FLAML's fitting: hold out roughly the last 10% of the training fold
            # (at most 1000 rows) as an eval set and fit with use_best_model=True.
            use_best_model = True
            n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
            X_tr, y_tr = X_train[:n], y_train[:n]
            eval_set = Pool(data=X_train[n:], label=y_train[n:], cat_features=[]) if use_best_model else None
            model.fit(X_tr, y_tr, eval_set=eval_set, use_best_model=True)
        y_pred = model.predict(X_val)
        if task == "classification":
            reproduced_metric = 1 - f1_score(y_val, y_pred)
        else:
            reproduced_metric = 1 - r2_score(y_val, y_pred)
all_fold_metrics.append(reproduced_metric)
return all_fold_metrics
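
For context, a minimal usage sketch of this helper outside of FLAML follows; the dataset, splitter, and estimator are my own assumptions, not part of this PR, and it assumes the repository root as the working directory so that test.conftest is importable:

from test.conftest import evaluate_cv_folds_with_underlying_model

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
fold_losses = evaluate_cv_folds_with_underlying_model(
    X,
    y.to_numpy(),
    KFold(n_splits=3, shuffle=True, random_state=0),
    RandomForestClassifier(random_state=0),
    "classification",
)
print(np.mean(fold_losses))  # mean (1 - f1) across the three folds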