[ENH] Remove fit_predict_proba from Base-Class, Use sklearn cross_val_predict instead (#117)

* remove fit_predict_proba, use sklearn cross_val_predict instead

* fix ensemble

* remove tests for fit_predict

* missed one function

---------

Co-authored-by: Patrick Schäfer <patrick.schaefer@informatik.hu-berlin.de>
patrickzib authored Feb 28, 2023
1 parent 9b480f9 commit 015339f
Showing 3 changed files with 9 additions and 252 deletions.
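
The migration for downstream code is mechanical: where `clf.fit_predict_proba(X=X, y=y, cv=...)` was used, call sklearn's `cross_val_predict` with `method="predict_proba"`, then fit the classifier on the full data if a fitted model is still needed (this is exactly what the ensemble change below does). A minimal sketch; the classifier choice and the random data are illustrative, not taken from the commit:

```python
# Minimal migration sketch; classifier and data are illustrative.
# Assumes a 3D panel of equal-length series.
import numpy as np
from sklearn.model_selection import cross_val_predict

from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

X = np.random.default_rng(0).normal(size=(30, 1, 50))  # (instances, dims, length)
y = np.array([0, 1] * 15)

clf = KNeighborsTimeSeriesClassifier()

# before this commit: train_probs = clf.fit_predict_proba(X=X, y=y, cv=3)
# after: out-of-sample probabilities via sklearn ...
train_probs = cross_val_predict(clf, X=X, y=y, cv=3, method="predict_proba")
# ... plus an explicit final fit, since cross_val_predict leaves clf unfitted
clf.fit(X, y)
```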
160 changes: 0 additions & 160 deletions sktime/classification/base.py
@@ -256,166 +256,6 @@ def predict_proba(self, X) -> np.ndarray:
        # call internal _predict_proba
        return self._predict_proba(X)

    def fit_predict(self, X, y, cv=None, change_state=True) -> np.ndarray:
        """Fit and predict labels for sequences in X.

        Method to produce predictions for the train set, either using the model fit
        on the whole data or through cross validation.

        Writes to self, if change_state=True:
            Sets self.is_fitted to True.
            Sets fitted model attributes ending in "_".
        Does not update state if change_state=False.

        Parameters
        ----------
        X : 3D np.array (any number of dimensions, equal length series)
                of shape [n_instances, n_dimensions, series_length]
            or 2D np.array (univariate, equal length series)
                of shape [n_instances, series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series
                (any number of dimensions, equal or unequal length series)
        y : 1D np.array of int, of shape [n_instances] - class labels for fitting
            indices correspond to instance indices in X
        cv : None, int, or sklearn cross-validation object, optional, default=None
            None : predictions are in-sample, equivalent to fit(X, y).predict(X)
            cv : predictions are equivalent to fit(X_train, y_train).predict(X_test)
                where multiple X_train, y_train, X_test are obtained from cv folds
                returned y is union over all test fold predictions
                cv test folds must be non-intersecting
            int : equivalent to cv=KFold(cv, shuffle=True, random_state=x),
                i.e., k-fold cross-validation predictions out-of-sample
                random_state x is taken from self if exists, otherwise x=None
        change_state : bool, optional (default=True)
            if False, will not change the state of the classifier,
                i.e., fit/predict sequence is run with a copy, self does not change
            if True, will fit self to the full X and y,
                end state will be equivalent to running fit(X, y)

        Returns
        -------
        y : 1D np.array of int, of shape [n_instances] - predicted class labels
            indices correspond to instance indices in X
            if cv is passed, -1 indicates entries not seen in union of test sets
        """
        return self._fit_predict_boilerplate(
            X=X, y=y, cv=cv, change_state=change_state, method="predict"
        )
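
For reference, a short usage sketch of the API being removed, following the docstring above and reusing `X`, `y`, and `clf` from the migration sketch near the top; hypothetical, and valid only for pre-#117 versions:

```python
# Behavior of the removed method, per its docstring (pre-#117 versions only);
# X, y, clf as in the migration sketch above.
y_in = clf.fit_predict(X, y)           # in-sample: same as clf.fit(X, y).predict(X)
y_oos = clf.fit_predict(X, y, cv=3)    # out-of-sample via 3-fold CV, also fits self
y_tmp = clf.fit_predict(X, y, cv=3, change_state=False)  # self stays unfitted
```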

    def _fit_predict_boilerplate(self, X, y, cv, change_state, method):
        """Logic for fit_predict and fit_predict_proba."""
        from sklearn.model_selection import KFold

        if isinstance(cv, int):
            random_state = getattr(self, "random_state", None)
            cv = KFold(cv, random_state=random_state, shuffle=True)

        if change_state:
            self.reset()
            est = self
        else:
            est = self.clone()

        if cv is None:
            return getattr(est.fit(X, y), method)(X)
        elif change_state:
            self.fit(X, y)

        # we now know that cv is an sklearn splitter
        X, y = self._internal_convert(X, y)
        X_metadata = self._check_classifier_input(X, y)
        missing = X_metadata["has_nans"]
        multivariate = not X_metadata["is_univariate"]
        unequal = not X_metadata["is_equal_length"]
        # Check this classifier can handle characteristics
        self._check_capabilities(missing, multivariate, unequal)

        # handle single class case
        if len(self._class_dictionary) == 1:
            return self._single_class_y_pred(X)

        # Convert data to format easily useable for applying cv
        if isinstance(X, np.ndarray):
            X = convert_to(
                X,
                to_type="numpy3D",
                as_scitype="Panel",
                store_behaviour="freeze",
            )
        else:
            X = convert_to(
                X,
                to_type="nested_univ",
                as_scitype="Panel",
                store_behaviour="freeze",
            )

        if method == "predict_proba":
            y_pred = np.empty([len(y), len(np.unique(y))])
        else:
            y_pred = np.empty_like(y)
        y_pred[:] = -1
        if isinstance(X, np.ndarray):
            for tr_idx, tt_idx in cv.split(X):
                X_train = X[tr_idx]
                X_test = X[tt_idx]
                y_train = y[tr_idx]
                fitted_est = self.clone().fit(X_train, y_train)
                y_pred[tt_idx] = getattr(fitted_est, method)(X_test)
        else:
            for tr_idx, tt_idx in cv.split(X):
                X_train = X.iloc[tr_idx]
                X_test = X.iloc[tt_idx]
                y_train = y[tr_idx]
                fitted_est = self.clone().fit(X_train, y_train)
                y_pred[tt_idx] = getattr(fitted_est, method)(X_test)

        return y_pred
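
The loop above is, in essence, a hand-rolled `cross_val_predict`. One behavioral difference: it pre-fills `y_pred` with -1, so cv splitters whose test folds do not cover all instances still return an answer, whereas sklearn's `cross_val_predict` raises an error unless the test folds form a partition of the data. A condensed sketch of the removed loop, assuming a 3D numpy panel and an estimator with sktime's `clone`/`fit`/`predict` interface; the helper name is hypothetical:

```python
# Condensed form of the removed loop (helper name hypothetical).
import numpy as np
from sklearn.model_selection import KFold


def manual_cv_predict(est, X, y, cv=None):
    cv = cv if cv is not None else KFold(3, shuffle=True, random_state=0)
    y_pred = np.empty_like(y)
    y_pred[:] = -1  # sentinel: instance never appeared in any test fold
    for tr_idx, tt_idx in cv.split(X):
        # refit a fresh clone per fold, predict only on the held-out fold
        fitted = est.clone().fit(X[tr_idx], y[tr_idx])
        y_pred[tt_idx] = fitted.predict(X[tt_idx])
    return y_pred
```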

    def fit_predict_proba(self, X, y, cv=None, change_state=True) -> np.ndarray:
        """Fit and predict label probabilities for sequences in X.

        Convenience method to produce in-sample predictions and
        cross-validated out-of-sample predictions.

        Parameters
        ----------
        X : 3D np.array (any number of dimensions, equal length series)
                of shape [n_instances, n_dimensions, series_length]
            or 2D np.array (univariate, equal length series)
                of shape [n_instances, series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series
                (any number of dimensions, equal or unequal length series)
            or of any other supported Panel mtype
                for list of mtypes, see datatypes.SCITYPE_REGISTER
                for specifications, see examples/AA_datatypes_and_datasets.ipynb
        y : 1D np.array of int, of shape [n_instances] - class labels for fitting
            indices correspond to instance indices in X
        cv : None, int, or sklearn cross-validation object, optional, default=None
            None : predictions are in-sample, equivalent to fit(X, y).predict(X)
            cv : predictions are equivalent to fit(X_train, y_train).predict(X_test)
                where multiple X_train, y_train, X_test are obtained from cv folds
                returned y is union over all test fold predictions
                cv test folds must be non-intersecting
            int : equivalent to cv=KFold(int), i.e., k-fold cross-validation
                predictions out-of-sample
        change_state : bool, optional (default=True)
            if False, will not change the state of the classifier,
                i.e., fit/predict sequence is run with a copy, self does not change
            if True, will fit self to the full X and y,
                end state will be equivalent to running fit(X, y)

        Returns
        -------
        y : 2D array of shape [n_instances, n_classes] - predicted class probabilities
            1st dimension indices correspond to instance indices in X
            2nd dimension indices correspond to possible labels (integers)
            (i, j)-th entry is predictive probability that i-th instance is of class j
        """
        return self._fit_predict_boilerplate(
            X=X, y=y, cv=cv, change_state=change_state, method="predict_proba"
        )

    def _single_class_y_pred(self, X, method="predict"):
        """Handle the prediction case where only single class label was seen in fit."""
        _, _, X_meta = check_is_scitype(X, scitype="Panel", return_metadata=True)
10 changes: 9 additions & 1 deletion sktime/classification/compose/_ensemble.py
@@ -14,6 +14,7 @@
     _get_n_samples_bootstrap,
 )
 from sklearn.metrics import accuracy_score
+from sklearn.model_selection import cross_val_predict
 from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import compute_sample_weight
@@ -729,8 +730,15 @@ def _fit(self, X, y):
         else:
             exponent = self.weights
         for clf_name, clf in self.classifiers_:
-            train_probs = clf.fit_predict_proba(X=X, y=y, cv=self.cv)
+            # learn cross-val accuracy of the model
+            train_probs = cross_val_predict(
+                clf, X=X, y=y, cv=self.cv, method="predict_proba"
+            )
+
+            # train final model
+            clf.fit(X, y)
             train_preds = clf.classes_[np.argmax(train_probs, axis=1)]
+
             if self.metric_type == "proba":
                 for i in range(len(train_preds)):
                     train_preds[i] = train_probs[i, np.argmax(train_probs[i, :])]
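
The unchanged context line maps the cross-validated probabilities back to hard labels: column j of `train_probs` corresponds to `clf.classes_[j]`, so a row-wise argmax indexes into `classes_`. A tiny self-contained illustration with made-up values:

```python
# Row-wise argmax over the probability columns recovers label predictions.
import numpy as np

train_probs = np.array([[0.2, 0.8],
                        [0.9, 0.1]])
classes_ = np.array(["b", "a"])  # column order as learned in fit

train_preds = classes_[np.argmax(train_probs, axis=1)]
print(train_preds)  # ['a' 'b']
```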
91 changes: 0 additions & 91 deletions sktime/classification/tests/test_base.py
@@ -8,13 +8,11 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn.model_selection import KFold

 from sktime.classification.base import BaseClassifier
 from sktime.classification.deep_learning.base import BaseDeepClassifier
 from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
 from sktime.classification.feature_based import Catch22Classifier
-from sktime.utils._testing.estimator_checks import _assert_array_almost_equal
 from sktime.utils._testing.panel import (
     _make_classification_y,
     _make_panel,
@@ -338,71 +336,6 @@ def test_input_conversion_fit_predict(mtype):
    clf.predict(X)


@pytest.mark.parametrize("method", ["fit_predict", "fit_predict_proba"])
def test_fit_predict_change_state(method):
    """Test change_state flag in fit_predict, fit_predict_proba works as intended."""
    X, y = make_classification_problem()

    clf = KNeighborsTimeSeriesClassifier()

    y_pred = getattr(clf, method)(X, y, change_state=False)
    assert not clf.is_fitted

    y_pred_post_fit = getattr(clf, method)(X, y, change_state=True)
    assert clf.is_fitted

    y_pred_post_fit2 = getattr(clf, method)(X, y, change_state=False)
    assert clf.is_fitted

    # get output from fit and predict or predict_proba
    clf = KNeighborsTimeSeriesClassifier()
    normal_method = method.partition("_")[2]
    y_pred_normal = getattr(clf.fit(X, y), normal_method)(X)

    # all the above outputs should be equal
    _assert_array_almost_equal(y_pred_normal, y_pred)
    _assert_array_almost_equal(y_pred_post_fit, y_pred)
    _assert_array_almost_equal(y_pred_post_fit, y_pred_post_fit2)

    assert len(y_pred) == len(y)
    if method == "fit_predict_proba":
        n_cl = len(y.unique())
        assert y_pred.shape[1] == n_cl


@pytest.mark.parametrize("method", ["fit_predict", "fit_predict_proba"])
def test_fit_predict_cv(method):
    """Test cv argument in fit_predict, fit_predict_proba."""
    X, y = make_classification_problem()

    clf = KNeighborsTimeSeriesClassifier()
    clf.random_state = 42
    cv = KFold(3, random_state=42, shuffle=True)

    y_pred_cv_int = getattr(clf, method)(X, y, cv=3, change_state=False)
    y_pred_cv_obj = getattr(clf, method)(X, y, cv=cv, change_state=False)
    assert not clf.is_fitted

    _assert_array_almost_equal(y_pred_cv_int, y_pred_cv_obj)
    assert -1 not in y_pred_cv_int

    assert len(y) == len(y_pred_cv_int)
    if method == "fit_predict_proba":
        n_cl = len(y.unique())
        assert y_pred_cv_int.shape[1] == n_cl

    # check that state is same as self.fit(X, y) if change_state=True
    y_pred_cv_obj_fit = getattr(clf, method)(X, y, cv=cv, change_state=True)
    assert clf.is_fitted

    # get output from fit and predict or predict_proba
    clf = KNeighborsTimeSeriesClassifier()
    normal_method = method.partition("_")[2]
    y_pred_normal = getattr(clf.fit(X, y), normal_method)(X)

    _assert_array_almost_equal(y_pred_normal, y_pred_cv_obj_fit)
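
The two tests above disappear along with the methods they exercise; an analogous out-of-sample check can be written directly against sklearn. A hedged sketch, with data and assertions that are illustrative rather than taken from the deleted tests:

```python
# Illustrative analogue of the deleted cv test, using sklearn directly.
import numpy as np
from sklearn.model_selection import KFold, cross_val_predict

from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

rng = np.random.default_rng(42)
X = rng.normal(size=(24, 1, 30))
y = np.array([0, 1] * 12)

clf = KNeighborsTimeSeriesClassifier()
cv = KFold(3, random_state=42, shuffle=True)

y_pred = cross_val_predict(clf, X, y, cv=cv)
probs = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")

assert len(y_pred) == len(y)
assert probs.shape == (len(y), len(np.unique(y)))
```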


@pytest.mark.parametrize("method", ["predict", "predict_proba"])
def test_predict_single_class(method):
    """Test return of predict/_proba in case only single class seen in fit."""
@@ -428,30 +361,6 @@ def test_predict_single_class(method):
    assert all(list(y_pred == 1))


@pytest.mark.parametrize("cv", [None, KFold(3, random_state=42, shuffle=True)])
@pytest.mark.parametrize("method", ["fit_predict", "fit_predict_proba"])
def test_fit_predict_single_class(method, cv):
"""Test return of fit_predict/_proba in case only single class seen in fit."""
X, y = make_classification_problem()
y[:] = 42
n_instances = len(X)

clf = KNeighborsTimeSeriesClassifier()

y_pred = getattr(clf, method)(X, y, cv=cv, change_state=False)

if method == "fit_predict":
assert isinstance(y_pred, np.ndarray)
assert y_pred.ndim == 1
assert y_pred.shape == (n_instances,)
assert all(list(y_pred == 42))
if method == "fit_predict_proba":
assert isinstance(y_pred, np.ndarray)
assert y_pred.ndim == 2
assert y_pred.shape == (n_instances, 1)
assert all(list(y_pred == 1))


@pytest.mark.skipif(
    not _check_soft_dependencies("tensorflow", severity="none"),
    reason="skip test if required soft dependency not available",
