Upgrade scikit-learn #4248

Merged
merged 13 commits on Jul 24, 2023
6 changes: 3 additions & 3 deletions .github/meta.yaml
@@ -28,7 +28,7 @@ outputs:
- pandas >=1.5.0, <2.0.0
- dask >=2022.2.0, !=2022.10.1
- scipy >=1.5.0
- scikit-learn >=1.2.2
- scikit-learn >=1.3.0
- scikit-optimize >=0.9.0
- statsmodels >=0.12.2
- colorama >=0.4.4
@@ -78,8 +78,8 @@ outputs:
- lightgbm >=4.0.0
- lime >=0.2.0.1
- python >=3.8.*
- imbalanced-learn >=0.9.1, <0.11.0
- sktime >=0.17.0
- imbalanced-learn >=0.11.0
- sktime >=0.21.0
- pmdarima >=1.8.5
- vowpalwabbit >=8.11.0
test:
2 changes: 1 addition & 1 deletion core-requirements.txt
@@ -1,7 +1,7 @@
numpy>=1.21.0
pandas>=1.5.0, <2.0.0
scipy>=1.5.0
scikit-learn>=1.2.1
scikit-learn>=1.3.0
scikit-optimize>=0.9.0
pyzmq>=20.0.0
colorama>=0.4.4
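For anyone verifying an upgraded environment against the new floors, a minimal check might look like the sketch below (hypothetical, not part of this PR; assumes the packaging library is installed):

# Hypothetical check: confirm installed versions satisfy the new minimums
# from core-requirements.txt.
from importlib.metadata import version
from packaging.version import Version

assert Version(version("scikit-learn")) >= Version("1.3.0")
assert Version(version("scikit-optimize")) >= Version("0.9.0")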
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
* Changes
* Unpinned sktime version :pr:`4214`
* Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237`
* Pinned scikit-learn version due to incompatibility with pinned imbalanced-learn :pr:`4248`
* Documentation Changes
* Testing Changes

2 changes: 1 addition & 1 deletion evalml/pipelines/component_graph.py
@@ -75,7 +75,7 @@ class ComponentGraph:
... 'max_depth': 6,
... 'n_jobs': -1},
... 'Decision Tree Classifier': {'criterion': 'gini',
... 'max_features': 'auto',
... 'max_features': 'sqrt',
... 'max_depth': 6,
... 'min_samples_split': 2,
... 'min_weight_fraction_leaf': 0.0},
@@ -34,7 +34,7 @@ class StackedEnsembleClassifier(StackedEnsembleBase):
>>> cg = ComponentGraph(component_graph)
>>> assert cg.default_parameters == {
... 'Decision Tree Classifier': {'criterion': 'gini',
... 'max_features': 'auto',
... 'max_features': 'sqrt',
... 'max_depth': 6,
... 'min_samples_split': 2,
... 'min_weight_fraction_leaf': 0.0},
@@ -14,17 +14,15 @@ class DecisionTreeClassifier(Estimator):
criterion ({"gini", "entropy"}): The function to measure the quality of a split.
Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
Defaults to "gini".
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:

- If int, then consider max_features features at each split.
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
- If "auto", then max_features=sqrt(n_features).
- If "sqrt", then max_features=sqrt(n_features).
- If "log2", then max_features=log2(n_features).
- If None, then max_features = n_features.

The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
Defaults to "auto".
max_depth (int): The maximum depth of the tree. Defaults to 6.
min_samples_split (int or float): The minimum number of samples required to split an internal node:

@@ -40,12 +38,12 @@ class DecisionTreeClassifier(Estimator):
name = "Decision Tree Classifier"
hyperparameter_ranges = {
"criterion": ["gini", "entropy"],
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}
"""{
"criterion": ["gini", "entropy"],
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}"""
model_family = ModelFamily.DECISION_TREE
@@ -66,7 +64,7 @@ class DecisionTreeClassifier(Estimator):
def __init__(
self,
criterion="gini",
max_features="auto",
max_features="sqrt",
max_depth=6,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
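The mechanical "auto" → "sqrt" swap in this file tracks scikit-learn 1.3, which removed the long-deprecated "auto" option. For tree classifiers the two were equivalent, as this illustrative sketch (not from the PR) confirms:

# Illustrative: for tree classifiers, "sqrt" reproduces what "auto" used to
# mean, namely max_features = sqrt(n_features) at each split.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, n_features=16, random_state=0)
clf = DecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0).fit(X, y)
assert clf.max_features_ == 4  # int(sqrt(16))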
@@ -12,17 +12,15 @@ class ExtraTreesClassifier(Estimator):

Args:
n_estimators (int): The number of trees in the forest. Defaults to 100.
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:

- If int, then consider max_features features at each split.
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
- If "auto", then max_features=sqrt(n_features).
- If "sqrt", then max_features=sqrt(n_features).
- If "log2", then max_features=log2(n_features).
- If None, then max_features = n_features.

The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
Defaults to "auto".
max_depth (int): The maximum depth of the tree. Defaults to 6.
min_samples_split (int or float): The minimum number of samples required to split an internal node:

@@ -39,12 +37,12 @@ class ExtraTreesClassifier(Estimator):
name = "Extra Trees Classifier"
hyperparameter_ranges = {
"n_estimators": Integer(10, 1000),
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}
"""{
"n_estimators": Integer(10, 1000),
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}
"""
@@ -66,7 +64,7 @@ class ExtraTreesClassifier(Estimator):
def __init__(
self,
n_estimators=100,
max_features="auto",
max_features="sqrt",
max_depth=6,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
@@ -1,11 +1,13 @@
"""K-Nearest Neighbors Classifier."""
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier as SKKNeighborsClassifier
from skopt.space import Integer

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import infer_feature_types


class KNeighborsClassifier(Estimator):
@@ -93,6 +95,34 @@ def __init__(
            random_seed=random_seed,
        )

    def predict(self, X: pd.DataFrame) -> pd.Series:
        """Make predictions using selected features.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].

        Returns:
            pd.Series: Predicted values.
        """
        predictions = self._component_obj.predict(X.to_numpy())
        predictions = infer_feature_types(predictions)
        predictions.index = X.index
        return predictions

    def predict_proba(self, X: pd.DataFrame) -> pd.Series:
        """Make probability estimates for labels.

        Args:
            X (pd.DataFrame): Features.

        Returns:
            pd.Series: Probability estimates.
        """
        pred_proba = self._component_obj.predict_proba(X.to_numpy())
        pred_proba = infer_feature_types(pred_proba)
        pred_proba.index = X.index
        return pred_proba

    @property
    def feature_importance(self):
        """Returns an array of zeroes matching the input number of features, as feature_importance is not defined for KNN classifiers."""
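A quick sanity check of the new overrides (illustrative usage; assumes KNeighborsClassifier is importable from evalml.pipelines.components as below):

# Illustrative: predictions round-trip through numpy but come back as a
# pandas Series aligned to X's index.
import pandas as pd
from evalml.pipelines.components import KNeighborsClassifier

X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, index=[10, 20, 30, 40])
y = pd.Series([0, 0, 1, 1], index=X.index)
knn = KNeighborsClassifier(n_neighbors=2).fit(X, y)
assert knn.predict(X).index.equals(X.index)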
@@ -18,11 +18,10 @@ class DecisionTreeRegressor(Estimator):
- "friedman_mse", which uses mean squared error with Friedman"s improvement score for potential splits
- "absolute_error" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node,
- "poisson" which uses reduction in Poisson deviance to find splits.
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:

- If int, then consider max_features features at each split.
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
- If "auto", then max_features=sqrt(n_features).
- If "sqrt", then max_features=sqrt(n_features).
- If "log2", then max_features=log2(n_features).
- If None, then max_features = n_features.
@@ -43,12 +42,12 @@ class DecisionTreeRegressor(Estimator):
name = "Decision Tree Regressor"
hyperparameter_ranges = {
"criterion": ["squared_error", "friedman_mse", "absolute_error"],
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}
"""{
"criterion": ["squared_error", "friedman_mse", "absolute_error"],
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}"""
model_family = ModelFamily.DECISION_TREE
@@ -65,7 +64,7 @@ class DecisionTreeRegressor(Estimator):
def __init__(
self,
criterion="squared_error",
max_features="auto",
max_features="sqrt",
max_depth=6,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
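One nuance worth flagging: for the tree regressors, scikit-learn's removed "auto" mapped to max_features=n_features, so defaulting to "sqrt" here is a small behavioral change rather than a pure rename. An illustrative sketch of the difference:

# Illustrative: under "sqrt" a regressor considers sqrt(n_features) candidate
# features per split; the old "auto" would have considered all 16.
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=100, n_features=16, random_state=0)
reg = DecisionTreeRegressor(max_depth=6, max_features="sqrt", random_state=0).fit(X, y)
assert reg.max_features_ == 4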
@@ -18,17 +18,15 @@ class ExtraTreesRegressor(Estimator):

Args:
n_estimators (int): The number of trees in the forest. Defaults to 100.
max_features (int, float or {"auto", "sqrt", "log2"}): The number of features to consider when looking for the best split:
max_features (int, float or {"sqrt", "log2"}): The number of features to consider when looking for the best split:

- If int, then consider max_features features at each split.
- If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
- If "auto", then max_features=sqrt(n_features).
- If "sqrt", then max_features=sqrt(n_features).
- If "log2", then max_features=log2(n_features).
- If None, then max_features = n_features.

The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
Defaults to "auto".
max_depth (int): The maximum depth of the tree. Defaults to 6.
min_samples_split (int or float): The minimum number of samples required to split an internal node:

@@ -45,12 +43,12 @@ class ExtraTreesRegressor(Estimator):
name = "Extra Trees Regressor"
hyperparameter_ranges = {
"n_estimators": Integer(10, 1000),
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}
"""{
"n_estimators": Integer(10, 1000),
"max_features": ["auto", "sqrt", "log2"],
"max_features": ["sqrt", "log2"],
"max_depth": Integer(4, 10),
}"""
model_family = ModelFamily.EXTRA_TREES
@@ -67,7 +65,7 @@ class ExtraTreesRegressor(Estimator):
def __init__(
self,
n_estimators: int = 100,
max_features: str = "auto",
max_features: str = "sqrt",
max_depth: int = 6,
min_samples_split: int = 2,
min_weight_fraction_leaf: float = 0.0,
2 changes: 1 addition & 1 deletion evalml/pipelines/components/utils.py
@@ -361,7 +361,7 @@ def generate_component_code(element):

Examples:
>>> from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor
>>> assert generate_component_code(DecisionTreeRegressor()) == "from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor\n\ndecisionTreeRegressor = DecisionTreeRegressor(**{'criterion': 'squared_error', 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0})"
>>> assert generate_component_code(DecisionTreeRegressor()) == "from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor\n\ndecisionTreeRegressor = DecisionTreeRegressor(**{'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0})"
...
>>> from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer
>>> assert generate_component_code(SimpleImputer()) == "from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer\n\nsimpleImputer = SimpleImputer(**{'impute_strategy': 'most_frequent', 'fill_value': None})"
10 changes: 5 additions & 5 deletions evalml/tests/component_tests/test_components.py
@@ -326,8 +326,8 @@ def test_describe_component():
lr_classifier = LogisticRegressionClassifier()
en_classifier = ElasticNetClassifier()
en_regressor = ElasticNetRegressor()
et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="auto")
et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="auto")
et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="sqrt")
et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="sqrt")
rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
linear_regressor = LinearRegressor()
@@ -374,7 +374,7 @@ def test_describe_component():
"name": "Extra Trees Classifier",
"parameters": {
"n_estimators": 10,
"max_features": "auto",
"max_features": "sqrt",
"max_depth": 6,
"min_samples_split": 2,
"min_weight_fraction_leaf": 0.0,
@@ -385,7 +385,7 @@ def test_describe_component():
"name": "Extra Trees Regressor",
"parameters": {
"n_estimators": 10,
"max_features": "auto",
"max_features": "sqrt",
"max_depth": 6,
"min_samples_split": 2,
"min_weight_fraction_leaf": 0.0,
@@ -1615,7 +1615,7 @@ def test_generate_code():

expected_code = (
"from evalml.pipelines.components.estimators.regressors.et_regressor import ExtraTreesRegressor"
"\n\nextraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})"
"\n\nextraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})"
)
component_code = generate_component_code(ExtraTreesRegressor(n_estimators=50))
assert component_code == expected_code
6 changes: 3 additions & 3 deletions evalml/tests/component_tests/test_decision_tree_classifier.py
@@ -22,7 +22,7 @@ def test_problem_types():
def test_fit_predict_binary(X_y_binary):
X, y = X_y_binary

sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0)
sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0)
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)
y_pred_proba_sk = sk_clf.predict_proba(X)
@@ -39,7 +39,7 @@ def test_fit_predict_binary(X_y_binary):
def test_fit_predict_multi(X_y_multi):
X, y = X_y_multi

sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0)
sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0)
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)
y_pred_proba_sk = sk_clf.predict_proba(X)
@@ -59,7 +59,7 @@ def test_feature_importance(X_y_binary):
X, y = X_y_binary

clf = DecisionTreeClassifier()
sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="auto", random_state=0)
sk_clf = SKDecisionTreeClassifier(max_depth=6, max_features="sqrt", random_state=0)
sk_clf.fit(X, y)
sk_feature_importance = sk_clf.feature_importances_

4 changes: 2 additions & 2 deletions evalml/tests/component_tests/test_decision_tree_regressor.py
@@ -20,7 +20,7 @@ def test_problem_types():
def test_fit_predict(X_y_regression):
X, y = X_y_regression

sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="auto", random_state=0)
sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="sqrt", random_state=0)
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)

@@ -36,7 +36,7 @@ def test_feature_importance(X_y_regression):
X, y = X_y_regression

clf = DecisionTreeRegressor()
sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="auto", random_state=0)
sk_clf = SKDecisionTreeRegressor(max_depth=6, max_features="sqrt", random_state=0)
sk_clf.fit(X, y)
sk_feature_importance = sk_clf.feature_importances_

2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_et_classifier.py
@@ -22,7 +22,7 @@ def test_problem_types():
def test_fit_predict_binary(X_y_binary):
X, y = X_y_binary

sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0)
sk_clf = SKExtraTreesClassifier(max_depth=6, random_state=0, max_features="sqrt")
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)
y_pred_proba_sk = sk_clf.predict_proba(X)
9 changes: 7 additions & 2 deletions evalml/tests/component_tests/test_et_regressor.py
@@ -20,7 +20,7 @@ def test_problem_types():
def test_fit_predict(X_y_regression):
X, y = X_y_regression

sk_clf = SKExtraTreesRegressor(max_depth=6, random_state=0)
sk_clf = SKExtraTreesRegressor(max_depth=6, random_state=0, max_features="sqrt")
sk_clf.fit(X, y)
y_pred_sk = sk_clf.predict(X)

@@ -36,7 +36,12 @@ def test_feature_importance(X_y_regression):
X, y = X_y_regression

clf = ExtraTreesRegressor(n_jobs=1)
sk_clf = SKExtraTreesRegressor(max_depth=6, random_state=0, n_jobs=1)
sk_clf = SKExtraTreesRegressor(
max_depth=6,
random_state=0,
n_jobs=1,
max_features="sqrt",
)
sk_clf.fit(X, y)
sk_feature_importance = sk_clf.feature_importances_

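The explicit max_features="sqrt" arguments added to the scikit-learn reference models above are needed because the two sides no longer share a default: scikit-learn's forest regressors default to 1.0 (all features), while evalml's components now default to "sqrt". A short illustrative check:

# Illustrative: sklearn's own regressor default considers all features, so
# the reference model must opt in to "sqrt" to mirror evalml's default.
from sklearn.ensemble import ExtraTreesRegressor as SKExtraTreesRegressor

assert SKExtraTreesRegressor().max_features == 1.0
assert SKExtraTreesRegressor(max_features="sqrt").max_features == "sqrt"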