[python-package] [docs] complete type annotations for scikit-learn fit() methods (#5816)
jameslamb authored Apr 11, 2023
1 parent 638014d commit 99daacf
Showing 2 changed files with 89 additions and 34 deletions.
63 changes: 32 additions & 31 deletions python-package/lightgbm/sklearn.py
@@ -10,7 +10,7 @@

from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
_LGBM_GroupType, _LGBM_LabelType, _log_warning)
_LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
@@ -83,6 +83,7 @@
_LGBM_ScikitCustomEvalFunction,
List[Union[str, _LGBM_ScikitCustomEvalFunction]]
]
_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]
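For illustration, this new alias describes one (X, y) entry of ``eval_set``: a features matrix paired with its labels. A minimal sketch of a value matching the alias (variable names here are illustrative, not from the diff):

    import numpy as np

    # one validation pair: Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]
    X_valid = np.random.rand(100, 4)
    y_valid = np.random.rand(100)
    valid_pair = (X_valid, y_valid)

    # fit() takes a list of such pairs
    eval_set = [valid_pair]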


class _ObjectiveFunctionWrapper:
@@ -725,15 +726,15 @@ def fit(
self,
X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType,
sample_weight=None,
init_score=None,
sample_weight: Optional[_LGBM_WeightType] = None,
init_score: Optional[_LGBM_InitScoreType] = None,
group: Optional[_LGBM_GroupType] = None,
eval_set=None,
eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None,
eval_sample_weight=None,
eval_class_weight=None,
eval_init_score=None,
eval_group=None,
eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_class_weight: Optional[List[float]] = None,
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_group: Optional[List[_LGBM_GroupType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -857,12 +858,12 @@ def _get_meta_data(collection, name, i):
fit.__doc__ = _lgbmmodel_doc_fit.format(
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
eval_sample_weight_shape="list of array, or None, optional (default=None)",
eval_init_score_shape="list of array, or None, optional (default=None)",
eval_group_shape="list of array, or None, optional (default=None)"
eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
) + "\n\n" + _lgbmmodel_doc_custom_eval_note
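Taken together, the annotated signature and the docstring shapes above describe what ``fit()`` now accepts. A hedged usage sketch with synthetic data (assumes lightgbm and numpy are installed; all variable names are illustrative):

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(500, 5)
    y = np.random.rand(500)
    weights = np.abs(np.random.randn(500))      # matches _LGBM_WeightType
    init_score = np.full_like(y, y.mean())      # matches _LGBM_InitScoreType
    X_valid = np.random.rand(100, 5)
    y_valid = np.random.rand(100)

    model = lgb.LGBMRegressor(n_estimators=10)
    model.fit(
        X, y,
        sample_weight=weights,
        init_score=init_score,
        eval_set=[(X_valid, y_valid)],          # List[_LGBM_ScikitValidSet]
        eval_sample_weight=[np.ones(100)],
        eval_init_score=[np.full(100, y.mean())],
    )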

def predict(
@@ -1021,12 +1022,12 @@ def fit( # type: ignore[override]
self,
X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType,
sample_weight=None,
init_score=None,
eval_set=None,
sample_weight: Optional[_LGBM_WeightType] = None,
init_score: Optional[_LGBM_InitScoreType] = None,
eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None,
eval_sample_weight=None,
eval_init_score=None,
eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -1067,13 +1068,13 @@ def fit( # type: ignore[override]
self,
X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType,
sample_weight=None,
init_score=None,
eval_set=None,
sample_weight: Optional[_LGBM_WeightType] = None,
init_score: Optional[_LGBM_InitScoreType] = None,
eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None,
eval_sample_weight=None,
eval_class_weight=None,
eval_init_score=None,
eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_class_weight: Optional[List[float]] = None,
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -1116,7 +1117,7 @@ def fit( # type: ignore[override]
eval_metric = eval_metric_list

# do not modify args, as it causes errors in model selection tools
valid_sets: Optional[List[Tuple]] = None
valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None
if eval_set is not None:
if isinstance(eval_set, tuple):
eval_set = [eval_set]
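The isinstance check above is why a bare (X, y) tuple still works at runtime even though the annotation says List: a single tuple gets wrapped into a one-element list before use. Both call styles below end up equivalent (continuing the sketch from earlier):

    model.fit(X, y, eval_set=(X_valid, y_valid))    # bare tuple, wrapped by fit()
    model.fit(X, y, eval_set=[(X_valid, y_valid)])  # list form, matching the annotation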
@@ -1251,14 +1252,14 @@ def fit( # type: ignore[override]
self,
X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType,
sample_weight=None,
init_score=None,
sample_weight: Optional[_LGBM_WeightType] = None,
init_score: Optional[_LGBM_InitScoreType] = None,
group: Optional[_LGBM_GroupType] = None,
eval_set=None,
eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None,
eval_sample_weight=None,
eval_init_score=None,
eval_group=None,
eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_group: Optional[List[_LGBM_GroupType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
feature_name: _LGBM_FeatureNameConfiguration = 'auto',
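This last ``fit()`` appears to be the ranking estimator's, given that it adds ``group``, ``eval_group``, and ``eval_at`` on top of the shared parameters. A hedged sketch with synthetic grouped data (all names illustrative):

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 4, size=100)  # graded relevance labels
    group = [10] * 10                      # ten queries, ten documents each

    ranker = lgb.LGBMRanker(n_estimators=10)
    ranker.fit(
        X, y,
        group=group,              # matches _LGBM_GroupType
        eval_set=[(X, y)],
        eval_group=[group],       # List[_LGBM_GroupType]
        eval_at=(1, 3, 5),        # cutoffs for the ranking metric
    )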
60 changes: 57 additions & 3 deletions tests/python_package_test/test_sklearn.py
@@ -1416,7 +1416,17 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
pytest.skip('pandas is not installed')
if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed')
X, y, g = _create_data(task, n_samples=1_000)
X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(np.random.randn(y.shape[0]))

if task == 'binary-classification' or task == 'regression':
init_score = np.full_like(y, np.mean(y))
elif task == 'multiclass-classification':
init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
else:
raise ValueError(f"Unrecognized task '{task}'")

X_valid = X * 2
if X_type == 'dt_DataTable':
X = dt_DataTable(X)
elif X_type == 'list2d':
@@ -1430,17 +1440,39 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
elif X_type != 'numpy':
raise ValueError(f"Unrecognized X_type: '{X_type}'")

# make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == 'list1d':
y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == 'pd_DataFrame':
y = pd_DataFrame(y)
weights = pd_Series(weights)
if task == 'multiclass-classification':
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type == 'pd_Series':
y = pd_Series(y)
weights = pd_Series(weights)
if task == 'multiclass-classification':
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type != 'numpy':
raise ValueError(f"Unrecognized y_type: '{y_type}'")

model = task_to_model_factory[task](n_estimators=10, verbose=-1)
model.fit(X, y)
model.fit(
X=X,
y=y,
sample_weight=weights,
init_score=init_score,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score]
)

preds = model.predict(X)
if task == 'binary-classification':
@@ -1462,6 +1494,10 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed')
X, y, g = _create_data(task='ranking', n_samples=1_000)
weights = np.abs(np.random.randn(y.shape[0]))
init_score = np.full_like(y, np.mean(y))
X_valid = X * 2

if X_type == 'dt_DataTable':
X = dt_DataTable(X)
elif X_type == 'list2d':
@@ -1475,12 +1511,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
elif X_type != 'numpy':
raise ValueError(f"Unrecognized X_type: '{X_type}'")

# make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == 'list1d':
y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == 'pd_DataFrame':
y = pd_DataFrame(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type == 'pd_Series':
y = pd_Series(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type != 'numpy':
raise ValueError(f"Unrecognized y_type: '{y_type}'")

@@ -1494,6 +1538,16 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
raise ValueError(f"Unrecognized g_type: '{g_type}'")

model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1)
model.fit(X, y, group=g)
model.fit(
X=X,
y=y,
sample_weight=weights,
init_score=init_score,
group=g,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score],
eval_group=[g]
)
preds = model.predict(X)
assert spearmanr(preds, y).correlation >= 0.99
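One practical payoff of completing these annotations: a static type checker can now vet ``fit()`` arguments. A sketch of the kind of mistake that would be flagged (assumes mypy or a similar checker can see lightgbm's hints; the specific diagnostic is illustrative):

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(50, 3)
    y = np.random.randint(0, 2, size=50)

    model = lgb.LGBMClassifier(n_estimators=5)
    # a checker reading the new annotations should reject this call:
    # str is not among the types sample_weight accepts
    model.fit(X, y, sample_weight="not-a-weight-array")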
