[ENH] Remove fit_predict_proba from Base-Class, Use sklearn cross_val_predict instead (#117)

* remove fit_predict_proba, use sklearn cross_val_predict instead

* fix ensemble

* remove tests for fit_predict

* missed one function

---------

Co-authored-by: Patrick Schäfer <patrick.schaefer@informatik.hu-berlin.de>
patrickzib authored Feb 28, 2023
1 parent 9b480f9 commit 015339f
Showing 3 changed files with 9 additions and 252 deletions.
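
The migration for downstream code is mechanical: where `clf.fit_predict_proba(X=X, y=y, cv=...)` was used, call sklearn's `cross_val_predict` with `method="predict_proba"`, then fit the classifier on the full data if a fitted model is still needed (this is exactly what the ensemble change below does). A minimal sketch; the classifier choice and the random data are illustrative, not taken from the commit:

```python
# Minimal migration sketch; classifier and data are illustrative.
# Assumes a 3D panel of equal-length series.
import numpy as np
from sklearn.model_selection import cross_val_predict

from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

X = np.random.default_rng(0).normal(size=(30, 1, 50))  # (instances, dims, length)
y = np.array([0, 1] * 15)

clf = KNeighborsTimeSeriesClassifier()

# before this commit: train_probs = clf.fit_predict_proba(X=X, y=y, cv=3)
# after: out-of-sample probabilities via sklearn ...
train_probs = cross_val_predict(clf, X=X, y=y, cv=3, method="predict_proba")
# ... plus an explicit final fit, since cross_val_predict leaves clf unfitted
clf.fit(X, y)
```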
160 changes: 0 additions & 160 deletions sktime/classification/base.py
@@ -256,166 +256,6 @@ def predict_proba(self, X) -> np.ndarray:
        # call internal _predict_proba
        return self._predict_proba(X)

    def fit_predict(self, X, y, cv=None, change_state=True) -> np.ndarray:
        """Fit and predict labels for sequences in X.

        Method to produce predictions for the train set, either using the model fit
        on the whole data or through cross validation.

        Writes to self, if change_state=True:
            Sets self.is_fitted to True.
            Sets fitted model attributes ending in "_".
        Does not update state if change_state=False.

        Parameters
        ----------
        X : 3D np.array (any number of dimensions, equal length series)
                of shape [n_instances, n_dimensions, series_length]
            or 2D np.array (univariate, equal length series)
                of shape [n_instances, series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series
                (any number of dimensions, equal or unequal length series)
        y : 1D np.array of int, of shape [n_instances] - class labels for fitting
            indices correspond to instance indices in X
        cv : None, int, or sklearn cross-validation object, optional, default=None
            None : predictions are in-sample, equivalent to fit(X, y).predict(X)
            cv : predictions are equivalent to fit(X_train, y_train).predict(X_test)
                where multiple X_train, y_train, X_test are obtained from cv folds
                returned y is union over all test fold predictions
                cv test folds must be non-intersecting
            int : equivalent to cv=KFold(cv, shuffle=True, random_state=x),
                i.e., k-fold cross-validation predictions out-of-sample
                random_state x is taken from self if exists, otherwise x=None
        change_state : bool, optional (default=True)
            if False, will not change the state of the classifier,
                i.e., fit/predict sequence is run with a copy, self does not change
            if True, will fit self to the full X and y,
                end state will be equivalent to running fit(X, y)

        Returns
        -------
        y : 1D np.array of int, of shape [n_instances] - predicted class labels
            indices correspond to instance indices in X
            if cv is passed, -1 indicates entries not seen in union of test sets
        """
        return self._fit_predict_boilerplate(
            X=X, y=y, cv=cv, change_state=change_state, method="predict"
        )
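
For reference, a short usage sketch of the API being removed, following the docstring above and reusing `X`, `y`, and `clf` from the migration sketch near the top; hypothetical, and valid only for pre-#117 versions:

```python
# Behavior of the removed method, per its docstring (pre-#117 versions only);
# X, y, clf as in the migration sketch above.
y_in = clf.fit_predict(X, y)           # in-sample: same as clf.fit(X, y).predict(X)
y_oos = clf.fit_predict(X, y, cv=3)    # out-of-sample via 3-fold CV, also fits self
y_tmp = clf.fit_predict(X, y, cv=3, change_state=False)  # self stays unfitted
```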

    def _fit_predict_boilerplate(self, X, y, cv, change_state, method):
        """Logic for fit_predict and fit_predict_proba."""
        from sklearn.model_selection import KFold

        if isinstance(cv, int):
            random_state = getattr(self, "random_state", None)
            cv = KFold(cv, random_state=random_state, shuffle=True)

        if change_state:
            self.reset()
            est = self
        else:
            est = self.clone()

        if cv is None:
            return getattr(est.fit(X, y), method)(X)
        elif change_state:
            self.fit(X, y)

        # we now know that cv is an sklearn splitter
        X, y = self._internal_convert(X, y)
        X_metadata = self._check_classifier_input(X, y)
        missing = X_metadata["has_nans"]
        multivariate = not X_metadata["is_univariate"]
        unequal = not X_metadata["is_equal_length"]
        # Check this classifier can handle characteristics
        self._check_capabilities(missing, multivariate, unequal)

        # handle single class case
        if len(self._class_dictionary) == 1:
            return self._single_class_y_pred(X)

        # Convert data to format easily useable for applying cv
        if isinstance(X, np.ndarray):
            X = convert_to(
                X,
                to_type="numpy3D",
                as_scitype="Panel",
                store_behaviour="freeze",
            )
        else:
            X = convert_to(
                X,
                to_type="nested_univ",
                as_scitype="Panel",
                store_behaviour="freeze",
            )

        if method == "predict_proba":
            y_pred = np.empty([len(y), len(np.unique(y))])
        else:
            y_pred = np.empty_like(y)
        y_pred[:] = -1
        if isinstance(X, np.ndarray):
            for tr_idx, tt_idx in cv.split(X):
                X_train = X[tr_idx]
                X_test = X[tt_idx]
                y_train = y[tr_idx]
                fitted_est = self.clone().fit(X_train, y_train)
                y_pred[tt_idx] = getattr(fitted_est, method)(X_test)
        else:
            for tr_idx, tt_idx in cv.split(X):
                X_train = X.iloc[tr_idx]
                X_test = X.iloc[tt_idx]
                y_train = y[tr_idx]
                fitted_est = self.clone().fit(X_train, y_train)
                y_pred[tt_idx] = getattr(fitted_est, method)(X_test)

        return y_pred
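
The loop above is, in essence, a hand-rolled `cross_val_predict`. One behavioral difference: it pre-fills `y_pred` with -1, so cv splitters whose test folds do not cover all instances still return an answer, whereas sklearn's `cross_val_predict` raises an error unless the test folds form a partition of the data. A condensed sketch of the removed loop, assuming a 3D numpy panel and an estimator with sktime's `clone`/`fit`/`predict` interface; the helper name is hypothetical:

```python
# Condensed form of the removed loop (helper name hypothetical).
import numpy as np
from sklearn.model_selection import KFold


def manual_cv_predict(est, X, y, cv=None):
    cv = cv if cv is not None else KFold(3, shuffle=True, random_state=0)
    y_pred = np.empty_like(y)
    y_pred[:] = -1  # sentinel: instance never appeared in any test fold
    for tr_idx, tt_idx in cv.split(X):
        # refit a fresh clone per fold, predict only on the held-out fold
        fitted = est.clone().fit(X[tr_idx], y[tr_idx])
        y_pred[tt_idx] = fitted.predict(X[tt_idx])
    return y_pred
```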

    def fit_predict_proba(self, X, y, cv=None, change_state=True) -> np.ndarray:
        """Fit and predict label probabilities for sequences in X.

        Convenience method to produce in-sample predictions and
        cross-validated out-of-sample predictions.

        Parameters
        ----------
        X : 3D np.array (any number of dimensions, equal length series)
                of shape [n_instances, n_dimensions, series_length]
            or 2D np.array (univariate, equal length series)
                of shape [n_instances, series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series
                (any number of dimensions, equal or unequal length series)
            or of any other supported Panel mtype
                for list of mtypes, see datatypes.SCITYPE_REGISTER
                for specifications, see examples/AA_datatypes_and_datasets.ipynb
        y : 1D np.array of int, of shape [n_instances] - class labels for fitting
            indices correspond to instance indices in X
        cv : None, int, or sklearn cross-validation object, optional, default=None
            None : predictions are in-sample, equivalent to fit(X, y).predict(X)
            cv : predictions are equivalent to fit(X_train, y_train).predict(X_test)
                where multiple X_train, y_train, X_test are obtained from cv folds
                returned y is union over all test fold predictions
                cv test folds must be non-intersecting
            int : equivalent to cv=KFold(int), i.e., k-fold cross-validation
                predictions out-of-sample
        change_state : bool, optional (default=True)
            if False, will not change the state of the classifier,
                i.e., fit/predict sequence is run with a copy, self does not change
            if True, will fit self to the full X and y,
                end state will be equivalent to running fit(X, y)

        Returns
        -------
        y : 2D array of shape [n_instances, n_classes] - predicted class probabilities
            1st dimension indices correspond to instance indices in X
            2nd dimension indices correspond to possible labels (integers)
            (i, j)-th entry is predictive probability that i-th instance is of class j
        """
        return self._fit_predict_boilerplate(
            X=X, y=y, cv=cv, change_state=change_state, method="predict_proba"
        )

    def _single_class_y_pred(self, X, method="predict"):
        """Handle the prediction case where only single class label was seen in fit."""
        _, _, X_meta = check_is_scitype(X, scitype="Panel", return_metadata=True)
10 changes: 9 additions & 1 deletion sktime/classification/compose/_ensemble.py
@@ -14,6 +14,7 @@
     _get_n_samples_bootstrap,
 )
 from sklearn.metrics import accuracy_score
+from sklearn.model_selection import cross_val_predict
 from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import compute_sample_weight
@@ -729,8 +730,15 @@ def _fit(self, X, y):
         else:
             exponent = self.weights
         for clf_name, clf in self.classifiers_:
-            train_probs = clf.fit_predict_proba(X=X, y=y, cv=self.cv)
+            # learn cross-val accuracy of the model
+            train_probs = cross_val_predict(
+                clf, X=X, y=y, cv=self.cv, method="predict_proba"
+            )
+
+            # train final model
+            clf.fit(X, y)
             train_preds = clf.classes_[np.argmax(train_probs, axis=1)]
+
             if self.metric_type == "proba":
                 for i in range(len(train_preds)):
                     train_preds[i] = train_probs[i, np.argmax(train_probs[i, :])]
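
The unchanged context line maps the cross-validated probabilities back to hard labels: column j of `train_probs` corresponds to `clf.classes_[j]`, so a row-wise argmax indexes into `classes_`. A tiny self-contained illustration with made-up values:

```python
# Row-wise argmax over the probability columns recovers label predictions.
import numpy as np

train_probs = np.array([[0.2, 0.8],
                        [0.9, 0.1]])
classes_ = np.array(["b", "a"])  # column order as learned in fit

train_preds = classes_[np.argmax(train_probs, axis=1)]
print(train_preds)  # ['a' 'b']
```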
91 changes: 0 additions & 91 deletions sktime/classification/tests/test_base.py
@@ -8,13 +8,11 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn.model_selection import KFold

 from sktime.classification.base import BaseClassifier
 from sktime.classification.deep_learning.base import BaseDeepClassifier
 from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
 from sktime.classification.feature_based import Catch22Classifier
-from sktime.utils._testing.estimator_checks import _assert_array_almost_equal
 from sktime.utils._testing.panel import (
     _make_classification_y,
     _make_panel,
@@ -338,71 +336,6 @@ def test_input_conversion_fit_predict(mtype):
    clf.predict(X)


@pytest.mark.parametrize("method", ["fit_predict", "fit_predict_proba"])
def test_fit_predict_change_state(method):
    """Test change_state flag in fit_predict, fit_predict_proba works as intended."""
    X, y = make_classification_problem()

    clf = KNeighborsTimeSeriesClassifier()

    y_pred = getattr(clf, method)(X, y, change_state=False)
    assert not clf.is_fitted

    y_pred_post_fit = getattr(clf, method)(X, y, change_state=True)
    assert clf.is_fitted

    y_pred_post_fit2 = getattr(clf, method)(X, y, change_state=False)
    assert clf.is_fitted

    # get output from fit and predict or predict_proba
    clf = KNeighborsTimeSeriesClassifier()
    normal_method = method.partition("_")[2]
    y_pred_normal = getattr(clf.fit(X, y), normal_method)(X)

    # all the above outputs should be equal
    _assert_array_almost_equal(y_pred_normal, y_pred)
    _assert_array_almost_equal(y_pred_post_fit, y_pred)
    _assert_array_almost_equal(y_pred_post_fit, y_pred_post_fit2)

    assert len(y_pred) == len(y)
    if method == "fit_predict_proba":
        n_cl = len(y.unique())
        assert y_pred.shape[1] == n_cl


@pytest.mark.parametrize("method", ["fit_predict", "fit_predict_proba"])
def test_fit_predict_cv(method):
    """Test cv argument in fit_predict, fit_predict_proba."""
    X, y = make_classification_problem()

    clf = KNeighborsTimeSeriesClassifier()
    clf.random_state = 42
    cv = KFold(3, random_state=42, shuffle=True)

    y_pred_cv_int = getattr(clf, method)(X, y, cv=3, change_state=False)
    y_pred_cv_obj = getattr(clf, method)(X, y, cv=cv, change_state=False)
    assert not clf.is_fitted

    _assert_array_almost_equal(y_pred_cv_int, y_pred_cv_obj)
    assert -1 not in y_pred_cv_int

    assert len(y) == len(y_pred_cv_int)
    if method == "fit_predict_proba":
        n_cl = len(y.unique())
        assert y_pred_cv_int.shape[1] == n_cl

    # check that state is same as self.fit(X, y) if change_state=True
    y_pred_cv_obj_fit = getattr(clf, method)(X, y, cv=cv, change_state=True)
    assert clf.is_fitted

    # get output from fit and predict or predict_proba
    clf = KNeighborsTimeSeriesClassifier()
    normal_method = method.partition("_")[2]
    y_pred_normal = getattr(clf.fit(X, y), normal_method)(X)

    _assert_array_almost_equal(y_pred_normal, y_pred_cv_obj_fit)
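
The two tests above disappear along with the methods they exercise; an analogous out-of-sample check can be written directly against sklearn. A hedged sketch, with data and assertions that are illustrative rather than taken from the deleted tests:

```python
# Illustrative analogue of the deleted cv test, using sklearn directly.
import numpy as np
from sklearn.model_selection import KFold, cross_val_predict

from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

rng = np.random.default_rng(42)
X = rng.normal(size=(24, 1, 30))
y = np.array([0, 1] * 12)

clf = KNeighborsTimeSeriesClassifier()
cv = KFold(3, random_state=42, shuffle=True)

y_pred = cross_val_predict(clf, X, y, cv=cv)
probs = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")

assert len(y_pred) == len(y)
assert probs.shape == (len(y), len(np.unique(y)))
```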


@pytest.mark.parametrize("method", ["predict", "predict_proba"])
def test_predict_single_class(method):
    """Test return of predict/_proba in case only single class seen in fit."""
@@ -428,30 +361,6 @@ def test_predict_single_class(method):
    assert all(list(y_pred == 1))


@pytest.mark.parametrize("cv", [None, KFold(3, random_state=42, shuffle=True)])
@pytest.mark.parametrize("method", ["fit_predict", "fit_predict_proba"])
def test_fit_predict_single_class(method, cv):
"""Test return of fit_predict/_proba in case only single class seen in fit."""
X, y = make_classification_problem()
y[:] = 42
n_instances = len(X)

clf = KNeighborsTimeSeriesClassifier()

y_pred = getattr(clf, method)(X, y, cv=cv, change_state=False)

if method == "fit_predict":
assert isinstance(y_pred, np.ndarray)
assert y_pred.ndim == 1
assert y_pred.shape == (n_instances,)
assert all(list(y_pred == 42))
if method == "fit_predict_proba":
assert isinstance(y_pred, np.ndarray)
assert y_pred.ndim == 2
assert y_pred.shape == (n_instances, 1)
assert all(list(y_pred == 1))


@pytest.mark.skipif(
    not _check_soft_dependencies("tensorflow", severity="none"),
    reason="skip test if required soft dependency not available",
