diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py
index db5a8e5297..deef827adf 100644
--- a/test/automl/test_classification.py
+++ b/test/automl/test_classification.py
@@ -1,11 +1,15 @@
 import unittest
 from datetime import datetime
+from test.conftest import evaluate_cv_folds_with_underlying_model
 
 import numpy as np
 import pandas as pd
+import pytest
 import scipy.sparse
 from sklearn.datasets import load_breast_cancer
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import (
+    train_test_split,
+)
 
 from flaml import AutoML, tune
 from flaml.automl.model import LGBMEstimator
@@ -420,6 +424,120 @@ def test_sparse_matrix_lr(self):
         print(automl_experiment.best_estimator)
 
 
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        # "catboost",
+        "extra_tree",
+        "histgb",
+        "kneighbor",
+        "lgbm",
+        # "lrl1",
+        "lrl2",
+        "rf",
+        "xgboost",
+        "xgb_limitdepth",
+    ],
+)
+def test_reproducibility_of_classification_models(estimator: str):
+    """FLAML finds the best model for a given dataset, which it then provides to users.
+
+    However, there are reported issues where FLAML was providing an incorrect model - see here:
+    https://github.com/microsoft/FLAML/issues/1317
+    In this test we take the best model which FLAML provided us, and then retrain and test it on the
+    same folds, to verify that the result is reproducible.
+    """
+    automl = AutoML()
+    automl_settings = {
+        "max_iter": 5,
+        "time_budget": -1,
+        "task": "classification",
+        "n_jobs": 1,
+        "estimator_list": [estimator],
+        "eval_method": "cv",
+        "n_splits": 10,
+        "metric": "f1",
+        "keep_search_state": True,
+        "skip_transform": True,
+    }
+    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+    automl.fit(X_train=X, y_train=y, **automl_settings)
+    best_model = automl.model
+    assert best_model is not None
+    config = best_model.get_params()
+    val_loss_flaml = automl.best_result["val_loss"]
+
+    # Take the best model, and see if we can reproduce the best result
+    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
+        config=config,
+        estimator=best_model,
+        X_train_all=automl._state.X_train_all,
+        y_train_all=automl._state.y_train_all,
+        budget=None,
+        kf=automl._state.kf,
+        eval_metric="f1",
+        best_val_loss=None,
+        cv_score_agg_func=None,
+        log_training_metric=False,
+        fit_kwargs=None,
+        free_mem_ratio=0,
+    )
+    assert pytest.approx(val_loss_flaml) == reproduced_val_loss
+
+
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        # "catboost",
+        "extra_tree",
+        "histgb",
+        "kneighbor",
+        # "lgbm",
+        # "lrl1",
+        "lrl2",
+        "rf",
+        "xgboost",
+        "xgb_limitdepth",
+    ],
+)
+def test_reproducibility_of_underlying_classification_models(estimator: str):
+    """FLAML finds the best model for a given dataset, which it then provides to users.
+
+    However, there are reported issues where FLAML was providing an incorrect model - see here:
+    https://github.com/microsoft/FLAML/issues/1317
+    FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model.
+    Ideally, FLAMLised models should perform identically to the underlying model, when fitted
+    to the same data, with no budget. This verifies that this is the case for classification models.
+    In this test we take the best model which FLAML provided us, extract the underlying model,
+    before retraining and testing it on the same folds - to verify that the result is reproducible.
+ """ + automl = AutoML() + automl_settings = { + "max_iter": 5, + "time_budget": -1, + "task": "classification", + "n_jobs": 1, + "estimator_list": [estimator], + "eval_method": "cv", + "n_splits": 10, + "metric": "f1", + "keep_search_state": True, + "skip_transform": True, + } + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + val_loss_flaml = automl.best_result["val_loss"] + reproduced_val_loss_underlying_model = np.mean( + evaluate_cv_folds_with_underlying_model( + automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "classification" + ) + ) + + assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model + + if __name__ == "__main__": test = TestClassification() test.test_preprocess() diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index 52c6b1d048..bb4fc31da1 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -1,9 +1,12 @@ import unittest +from test.conftest import evaluate_cv_folds_with_underlying_model import numpy as np +import pytest import scipy.sparse from sklearn.datasets import ( fetch_california_housing, + make_regression, ) from flaml import AutoML @@ -205,7 +208,6 @@ def test_regression_xgboost(self): def test_multioutput(): - from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.multioutput import MultiOutputRegressor, RegressorChain @@ -230,5 +232,115 @@ def test_multioutput(): print(model.predict(X_test)) +@pytest.mark.parametrize( + "estimator", + [ + # "catboost", + "extra_tree", + "histgb", + "kneighbor", + "lgbm", + "rf", + "xgboost", + "xgb_limitdepth", + ], +) +def test_reproducibility_of_regression_models(estimator: str): + """FLAML finds the best model for a given dataset, which it then provides to users. + + However, there are reported issues where FLAML was providing an incorrect model - see here: + https://github.com/microsoft/FLAML/issues/1317 + In this test we take the best regression model which FLAML provided us, and then retrain and test it on the + same folds, to verify that the result is reproducible. 
+ """ + automl = AutoML() + automl_settings = { + "max_iter": 2, + "time_budget": -1, + "task": "regression", + "n_jobs": 1, + "estimator_list": [estimator], + "eval_method": "cv", + "n_splits": 3, + "metric": "r2", + "keep_search_state": True, + "skip_transform": True, + } + X, y = fetch_california_housing(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + config = best_model.get_params() + val_loss_flaml = automl.best_result["val_loss"] + + # Take the best model, and see if we can reproduce the best result + reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV( + config=config, + estimator=best_model, + X_train_all=automl._state.X_train_all, + y_train_all=automl._state.y_train_all, + budget=None, + kf=automl._state.kf, + eval_metric="r2", + best_val_loss=None, + cv_score_agg_func=None, + log_training_metric=False, + fit_kwargs=None, + free_mem_ratio=0, + ) + assert pytest.approx(val_loss_flaml) == reproduced_val_loss + + +@pytest.mark.parametrize( + "estimator", + [ + # "catboost", + "extra_tree", + "histgb", + "kneighbor", + # "lgbm", + "rf", + "xgboost", + "xgb_limitdepth", + ], +) +def test_reproducibility_of_underlying_regression_models(estimator: str): + """FLAML finds the best model for a given dataset, which it then provides to users. + + However, there are reported issues where FLAML was providing an incorrect model - see here: + https://github.com/microsoft/FLAML/issues/1317 + FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model. + Ideally, FLAMLised models should perform identically to the underlying model, when fitted + to the same data, with no budget. This verifies that this is the case for regression models. + In this test we take the best model which FLAML provided us, extract the underlying model, + before retraining and testing it on the same folds - to verify that the result is reproducible. + """ + automl = AutoML() + automl_settings = { + "max_iter": 5, + "time_budget": -1, + "task": "regression", + "n_jobs": 1, + "estimator_list": [estimator], + "eval_method": "cv", + "n_splits": 10, + "metric": "r2", + "keep_search_state": True, + "skip_transform": True, + } + X, y = fetch_california_housing(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + val_loss_flaml = automl.best_result["val_loss"] + reproduced_val_loss_underlying_model = np.mean( + evaluate_cv_folds_with_underlying_model( + automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "regression" + ) + ) + + assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model + + if __name__ == "__main__": unittest.main() diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000000..47a74b2896 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, List, Union + +import numpy as np +import pandas as pd +from catboost import CatBoostClassifier, CatBoostRegressor, Pool +from sklearn.metrics import f1_score, r2_score + + +def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> pd.DataFrame: + """Mimic the FLAML CV process to calculate the metrics across each fold. 
+
+    :param X_train_all: X training data
+    :param y_train_all: y training data
+    :param kf: The splitter object to use to generate the folds
+    :param model: The estimator to fit to the data during the CV process
+    :param task: classification or regression
+    :return: An array containing the metrics
+    """
+    rng = np.random.RandomState(2020)
+    all_fold_metrics: List[Dict[str, Union[int, float]]] = []
+    for train_index, val_index in kf.split(X_train_all, y_train_all):
+        X_train_split, y_train_split = X_train_all, y_train_all
+        train_index = rng.permutation(train_index)
+        X_train = X_train_split.iloc[train_index]
+        X_val = X_train_split.iloc[val_index]
+        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
+        model_type = type(model)
+        if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
+            model.fit(X_train, y_train)
+        else:
+            use_best_model = True
+            n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
+            X_tr, y_tr = (X_train)[:n], y_train[:n]
+            eval_set = Pool(data=X_train[n:], label=y_train[n:], cat_features=[]) if use_best_model else None
+            model.fit(X_tr, y_tr, eval_set=eval_set, use_best_model=True)
+        y_pred_classes = model.predict(X_val)
+        if task == "classification":
+            reproduced_metric = 1 - f1_score(y_val, y_pred_classes)
+        else:
+            reproduced_metric = 1 - r2_score(y_val, y_pred_classes)
+        all_fold_metrics.append(reproduced_metric)
+    return all_fold_metrics