diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py
index db5a8e5297..deef827adf 100644
--- a/test/automl/test_classification.py
+++ b/test/automl/test_classification.py
@@ -1,11 +1,15 @@
 import unittest
 from datetime import datetime
+from test.conftest import evaluate_cv_folds_with_underlying_model
 
 import numpy as np
 import pandas as pd
+import pytest
 import scipy.sparse
 from sklearn.datasets import load_breast_cancer
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import (
+    train_test_split,
+)
 
 from flaml import AutoML, tune
 from flaml.automl.model import LGBMEstimator
@@ -420,6 +424,120 @@ def test_sparse_matrix_lr(self):
         print(automl_experiment.best_estimator)
 
 
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        # "catboost",
+        "extra_tree",
+        "histgb",
+        "kneighbor",
+        "lgbm",
+        # "lrl1",
+        "lrl2",
+        "rf",
+        "xgboost",
+        "xgb_limitdepth",
+    ],
+)
+def test_reproducibility_of_classification_models(estimator: str):
+    """FLAML finds the best model for a given dataset, which it then provides to users.
+
+    However, there are reported issues where FLAML was providing an incorrect model - see here:
+    https://github.com/microsoft/FLAML/issues/1317
+    In this test we take the best model which FLAML provided us, and then retrain and test it on the
+    same folds, to verify that the result is reproducible.
+    """
+    automl = AutoML()
+    automl_settings = {
+        "max_iter": 5,
+        "time_budget": -1,
+        "task": "classification",
+        "n_jobs": 1,
+        "estimator_list": [estimator],
+        "eval_method": "cv",
+        "n_splits": 10,
+        "metric": "f1",
+        "keep_search_state": True,
+        "skip_transform": True,
+    }
+    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+    automl.fit(X_train=X, y_train=y, **automl_settings)
+    best_model = automl.model
+    assert best_model is not None
+    config = best_model.get_params()
+    val_loss_flaml = automl.best_result["val_loss"]
+
+    # Take the best model, and see if we can reproduce the best result
+    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
+        config=config,
+        estimator=best_model,
+        X_train_all=automl._state.X_train_all,
+        y_train_all=automl._state.y_train_all,
+        budget=None,
+        kf=automl._state.kf,
+        eval_metric="f1",
+        best_val_loss=None,
+        cv_score_agg_func=None,
+        log_training_metric=False,
+        fit_kwargs=None,
+        free_mem_ratio=0,
+    )
+    assert pytest.approx(val_loss_flaml) == reproduced_val_loss
+
+
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        # "catboost",
+        "extra_tree",
+        "histgb",
+        "kneighbor",
+        # "lgbm",
+        # "lrl1",
+        "lrl2",
+        "rf",
+        "xgboost",
+        "xgb_limitdepth",
+    ],
+)
+def test_reproducibility_of_underlying_classification_models(estimator: str):
+    """FLAML finds the best model for a given dataset, which it then provides to users.
+
+    However, there are reported issues where FLAML was providing an incorrect model - see here:
+    https://github.com/microsoft/FLAML/issues/1317
+    FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model.
+    Ideally, FLAMLised models should perform identically to the underlying model, when fitted
+    to the same data, with no budget. This verifies that this is the case for classification models.
+    In this test we take the best model which FLAML provided us, extract the underlying model,
+    before retraining and testing it on the same folds - to verify that the result is reproducible.
+ """ + automl = AutoML() + automl_settings = { + "max_iter": 5, + "time_budget": -1, + "task": "classification", + "n_jobs": 1, + "estimator_list": [estimator], + "eval_method": "cv", + "n_splits": 10, + "metric": "f1", + "keep_search_state": True, + "skip_transform": True, + } + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + val_loss_flaml = automl.best_result["val_loss"] + reproduced_val_loss_underlying_model = np.mean( + evaluate_cv_folds_with_underlying_model( + automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "classification" + ) + ) + + assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model + + if __name__ == "__main__": test = TestClassification() test.test_preprocess() diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index 52c6b1d048..bb4fc31da1 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -1,9 +1,12 @@ import unittest +from test.conftest import evaluate_cv_folds_with_underlying_model import numpy as np +import pytest import scipy.sparse from sklearn.datasets import ( fetch_california_housing, + make_regression, ) from flaml import AutoML @@ -205,7 +208,6 @@ def test_regression_xgboost(self): def test_multioutput(): - from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.multioutput import MultiOutputRegressor, RegressorChain @@ -230,5 +232,115 @@ def test_multioutput(): print(model.predict(X_test)) +@pytest.mark.parametrize( + "estimator", + [ + # "catboost", + "extra_tree", + "histgb", + "kneighbor", + "lgbm", + "rf", + "xgboost", + "xgb_limitdepth", + ], +) +def test_reproducibility_of_regression_models(estimator: str): + """FLAML finds the best model for a given dataset, which it then provides to users. + + However, there are reported issues where FLAML was providing an incorrect model - see here: + https://github.com/microsoft/FLAML/issues/1317 + In this test we take the best regression model which FLAML provided us, and then retrain and test it on the + same folds, to verify that the result is reproducible. 
+ """ + automl = AutoML() + automl_settings = { + "max_iter": 2, + "time_budget": -1, + "task": "regression", + "n_jobs": 1, + "estimator_list": [estimator], + "eval_method": "cv", + "n_splits": 3, + "metric": "r2", + "keep_search_state": True, + "skip_transform": True, + } + X, y = fetch_california_housing(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + config = best_model.get_params() + val_loss_flaml = automl.best_result["val_loss"] + + # Take the best model, and see if we can reproduce the best result + reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV( + config=config, + estimator=best_model, + X_train_all=automl._state.X_train_all, + y_train_all=automl._state.y_train_all, + budget=None, + kf=automl._state.kf, + eval_metric="r2", + best_val_loss=None, + cv_score_agg_func=None, + log_training_metric=False, + fit_kwargs=None, + free_mem_ratio=0, + ) + assert pytest.approx(val_loss_flaml) == reproduced_val_loss + + +@pytest.mark.parametrize( + "estimator", + [ + # "catboost", + "extra_tree", + "histgb", + "kneighbor", + # "lgbm", + "rf", + "xgboost", + "xgb_limitdepth", + ], +) +def test_reproducibility_of_underlying_regression_models(estimator: str): + """FLAML finds the best model for a given dataset, which it then provides to users. + + However, there are reported issues where FLAML was providing an incorrect model - see here: + https://github.com/microsoft/FLAML/issues/1317 + FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model. + Ideally, FLAMLised models should perform identically to the underlying model, when fitted + to the same data, with no budget. This verifies that this is the case for regression models. + In this test we take the best model which FLAML provided us, extract the underlying model, + before retraining and testing it on the same folds - to verify that the result is reproducible. + """ + automl = AutoML() + automl_settings = { + "max_iter": 5, + "time_budget": -1, + "task": "regression", + "n_jobs": 1, + "estimator_list": [estimator], + "eval_method": "cv", + "n_splits": 10, + "metric": "r2", + "keep_search_state": True, + "skip_transform": True, + } + X, y = fetch_california_housing(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + val_loss_flaml = automl.best_result["val_loss"] + reproduced_val_loss_underlying_model = np.mean( + evaluate_cv_folds_with_underlying_model( + automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "regression" + ) + ) + + assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model + + if __name__ == "__main__": unittest.main() diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000000..47a74b2896 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, List, Union + +import numpy as np +import pandas as pd +from catboost import CatBoostClassifier, CatBoostRegressor, Pool +from sklearn.metrics import f1_score, r2_score + + +def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> pd.DataFrame: + """Mimic the FLAML CV process to calculate the metrics across each fold. 
+
+    :param X_train_all: X training data
+    :param y_train_all: y training data
+    :param kf: The splitter object to use to generate the folds
+    :param model: The estimator to fit to the data during the CV process
+    :param task: classification or regression
+    :return: An array containing the metrics
+    """
+    rng = np.random.RandomState(2020)
+    all_fold_metrics: List[Dict[str, Union[int, float]]] = []
+    for train_index, val_index in kf.split(X_train_all, y_train_all):
+        X_train_split, y_train_split = X_train_all, y_train_all
+        train_index = rng.permutation(train_index)
+        X_train = X_train_split.iloc[train_index]
+        X_val = X_train_split.iloc[val_index]
+        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
+        model_type = type(model)
+        if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
+            model.fit(X_train, y_train)
+        else:
+            use_best_model = True
+            n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
+            X_tr, y_tr = (X_train)[:n], y_train[:n]
+            eval_set = Pool(data=X_train[n:], label=y_train[n:], cat_features=[]) if use_best_model else None
+            model.fit(X_tr, y_tr, eval_set=eval_set, use_best_model=True)
+        y_pred_classes = model.predict(X_val)
+        if task == "classification":
+            reproduced_metric = 1 - f1_score(y_val, y_pred_classes)
+        else:
+            reproduced_metric = 1 - r2_score(y_val, y_pred_classes)
+        all_fold_metrics.append(reproduced_metric)
+    return all_fold_metrics