[python-package] [docs] complete type annotations for scikit-learn fit() methods #5816

Merged (6 commits) on Apr 11, 2023
63 changes: 32 additions & 31 deletions python-package/lightgbm/sklearn.py
@@ -10,7 +10,7 @@

 from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
                     _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
-                    _LGBM_GroupType, _LGBM_LabelType, _log_warning)
+                    _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
 from .callback import _EvalResultDict, record_evaluation
 from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
                      _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
@@ -83,6 +83,7 @@
     _LGBM_ScikitCustomEvalFunction,
     List[Union[str, _LGBM_ScikitCustomEvalFunction]]
 ]
+_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]


 class _ObjectiveFunctionWrapper:
@@ -725,15 +726,15 @@ def fit(
         self,
         X: _LGBM_ScikitMatrixLike,
         y: _LGBM_LabelType,
-        sample_weight=None,
-        init_score=None,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
         group: Optional[_LGBM_GroupType] = None,
-        eval_set=None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_class_weight=None,
-        eval_init_score=None,
-        eval_group=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_class_weight: Optional[List[float]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+        eval_group: Optional[List[_LGBM_GroupType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
         categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -857,12 +858,12 @@ def _get_meta_data(collection, name, i):
     fit.__doc__ = _lgbmmodel_doc_fit.format(
         X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
         y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
-        sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
-        init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
+        sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
         group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
-        eval_sample_weight_shape="list of array, or None, optional (default=None)",
-        eval_init_score_shape="list of array, or None, optional (default=None)",
-        eval_group_shape="list of array, or None, optional (default=None)"
+        eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
+        eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
+        eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
     ) + "\n\n" + _lgbmmodel_doc_custom_eval_note

     def predict(
@@ -1021,12 +1022,12 @@ def fit( # type: ignore[override]
         self,
         X: _LGBM_ScikitMatrixLike,
         y: _LGBM_LabelType,
-        sample_weight=None,
-        init_score=None,
-        eval_set=None,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_init_score=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
         categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -1067,13 +1068,13 @@ def fit( # type: ignore[override]
         self,
         X: _LGBM_ScikitMatrixLike,
         y: _LGBM_LabelType,
-        sample_weight=None,
-        init_score=None,
-        eval_set=None,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_class_weight=None,
-        eval_init_score=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_class_weight: Optional[List[float]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
         categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -1116,7 +1117,7 @@ def fit( # type: ignore[override]
             eval_metric = eval_metric_list

         # do not modify args, as it causes errors in model selection tools
-        valid_sets: Optional[List[Tuple]] = None
+        valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None
         if eval_set is not None:
             if isinstance(eval_set, tuple):
                 eval_set = [eval_set]
@@ -1251,14 +1252,14 @@ def fit( # type: ignore[override]
         self,
         X: _LGBM_ScikitMatrixLike,
         y: _LGBM_LabelType,
-        sample_weight=None,
-        init_score=None,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
         group: Optional[_LGBM_GroupType] = None,
-        eval_set=None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_init_score=None,
-        eval_group=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+        eval_group: Optional[List[_LGBM_GroupType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
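For orientation, here is a minimal sketch (not part of the diff) of how the newly annotated fit() parameters line up in a call to LGBMClassifier. The parameter names and the aliases cited in the comments come from the diff above; the data, shapes, and hyperparameters are illustrative assumptions.

# Illustrative only: exercising the fit() parameters annotated in this PR.
# Data and shapes are made up; the aliases in the comments are from sklearn.py.
import numpy as np
from lightgbm import LGBMClassifier

rng = np.random.default_rng(42)
X = rng.random((100, 5))
y = rng.integers(0, 2, size=100)
weights = np.abs(rng.standard_normal(100))   # accepted by _LGBM_WeightType
init_score = np.full(100, y.mean())          # accepted by _LGBM_InitScoreType
X_valid = rng.random((50, 5))
y_valid = rng.integers(0, 2, size=50)

model = LGBMClassifier(n_estimators=10, verbose=-1)
model.fit(
    X, y,
    sample_weight=weights,                   # Optional[_LGBM_WeightType]
    init_score=init_score,                   # Optional[_LGBM_InitScoreType]
    eval_set=[(X_valid, y_valid)],           # Optional[List[_LGBM_ScikitValidSet]]
    eval_sample_weight=[np.abs(rng.standard_normal(50))],  # Optional[List[_LGBM_WeightType]]
)

With these annotations in place, a type checker such as mypy can flag a bare (X, y) tuple passed as eval_set — which the runtime tolerates, per the isinstance check above — since the declared type is a list of tuples.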
60 changes: 57 additions & 3 deletions tests/python_package_test/test_sklearn.py
@@ -1416,7 +1416,17 @@ def test_classification_and_regression_minimally_work_with_all_accepted_data
         pytest.skip('pandas is not installed')
     if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
         pytest.skip('datatable is not installed')
-    X, y, g = _create_data(task, n_samples=1_000)
+    X, y, g = _create_data(task, n_samples=2_000)
+    weights = np.abs(np.random.randn(y.shape[0]))
+
+    if task == 'binary-classification' or task == 'regression':
+        init_score = np.full_like(y, np.mean(y))
+    elif task == 'multiclass-classification':
+        init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
+    else:
+        raise ValueError(f"Unrecognized task '{task}'")
+
+    X_valid = X * 2
     if X_type == 'dt_DataTable':
         X = dt_DataTable(X)
     elif X_type == 'list2d':
@@ -1430,17 +1440,39 @@ def test_classification_and_regression_minimally_work_with_all_accepted_data
     elif X_type != 'numpy':
         raise ValueError(f"Unrecognized X_type: '{X_type}'")

+    # make weights and init_score same types as y, just to avoid
+    # a huge number of combinations and therefore test cases
     if y_type == 'list1d':
         y = y.tolist()
+        weights = weights.tolist()
+        init_score = init_score.tolist()
     elif y_type == 'pd_DataFrame':
         y = pd_DataFrame(y)
+        weights = pd_Series(weights)
+        if task == 'multiclass-classification':
+            init_score = pd_DataFrame(init_score)
+        else:
+            init_score = pd_Series(init_score)
     elif y_type == 'pd_Series':
         y = pd_Series(y)
+        weights = pd_Series(weights)
+        if task == 'multiclass-classification':
+            init_score = pd_DataFrame(init_score)
+        else:
+            init_score = pd_Series(init_score)
     elif y_type != 'numpy':
         raise ValueError(f"Unrecognized y_type: '{y_type}'")

     model = task_to_model_factory[task](n_estimators=10, verbose=-1)
-    model.fit(X, y)
+    model.fit(
+        X=X,
+        y=y,
+        sample_weight=weights,
+        init_score=init_score,
+        eval_set=[(X_valid, y)],
+        eval_sample_weight=[weights],
+        eval_init_score=[init_score]
+    )

     preds = model.predict(X)
     if task == 'binary-classification':
@@ -1462,6 +1494,10 @@ def test_ranking_minimally_works_with_all_accepted_data_types(X_type, y_type
     if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
         pytest.skip('datatable is not installed')
     X, y, g = _create_data(task='ranking', n_samples=1_000)
+    weights = np.abs(np.random.randn(y.shape[0]))
+    init_score = np.full_like(y, np.mean(y))
+    X_valid = X * 2
+
     if X_type == 'dt_DataTable':
         X = dt_DataTable(X)
     elif X_type == 'list2d':
@@ -1475,12 +1511,20 @@ def test_ranking_minimally_works_with_all_accepted_data_types(X_type, y_type
     elif X_type != 'numpy':
         raise ValueError(f"Unrecognized X_type: '{X_type}'")

+    # make weights and init_score same types as y, just to avoid
+    # a huge number of combinations and therefore test cases
     if y_type == 'list1d':
         y = y.tolist()
+        weights = weights.tolist()
+        init_score = init_score.tolist()
     elif y_type == 'pd_DataFrame':
         y = pd_DataFrame(y)
+        weights = pd_Series(weights)
+        init_score = pd_Series(init_score)
     elif y_type == 'pd_Series':
         y = pd_Series(y)
+        weights = pd_Series(weights)
+        init_score = pd_Series(init_score)
     elif y_type != 'numpy':
         raise ValueError(f"Unrecognized y_type: '{y_type}'")

@@ -1494,6 +1538,16 @@ def test_ranking_minimally_works_with_all_accepted_data_types(X_type, y_type
         raise ValueError(f"Unrecognized g_type: '{g_type}'")

     model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1)
-    model.fit(X, y, group=g)
+    model.fit(
+        X=X,
+        y=y,
+        sample_weight=weights,
+        init_score=init_score,
+        group=g,
+        eval_set=[(X_valid, y)],
+        eval_sample_weight=[weights],
+        eval_init_score=[init_score],
+        eval_group=[g]
+    )
     preds = model.predict(X)
     assert spearmanr(preds, y).correlation >= 0.99
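A condensed sketch of the ranking analogue (again not part of the diff; the data and query-group sizes are made up), covering the group-related parameters typed in this PR:

# Illustrative only: group and eval_group take _LGBM_GroupType values.
import numpy as np
from lightgbm import LGBMRanker

rng = np.random.default_rng(0)
X = rng.random((100, 5))
y = rng.integers(0, 4, size=100)   # relevance labels
g = [60, 40]                       # query-group sizes summing to n_samples

ranker = LGBMRanker(n_estimators=10, verbose=-1)
ranker.fit(
    X, y,
    group=g,                       # Optional[_LGBM_GroupType]
    eval_set=[(X, y)],             # Optional[List[_LGBM_ScikitValidSet]]
    eval_group=[g],                # Optional[List[_LGBM_GroupType]]
    eval_at=[1, 3],                # Union[List[int], Tuple[int, ...]]
)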