Add scikit-learn tests (dmlc#3674)
* Add scikit-learn tests

Goal is to pass scikit-learn's check_estimator() for XGBClassifier,
XGBRegressor, and XGBRanker. It is not possible to do so entirely,
since check_estimator() assumes that NaN is disallowed, whereas
XGBoost accepts NaN as a missing value. Still, it is a good idea to
add some checks inspired by check_estimator(); a sketch of such a
check follows the commit message.

* Fix lint

* Fix lint
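
A minimal sketch of the kind of borrowed check this enables (illustrative
only, not part of the commit; dataset and parameters are made up):

    import numpy as np
    from xgboost import XGBClassifier

    # In the spirit of check_estimator(): fit on a tiny dataset and verify
    # that predict() honours scikit-learn's output-shape contract.
    X = np.random.rand(20, 3)
    y = np.array([0, 1] * 10)
    clf = XGBClassifier(n_estimators=2).fit(X, y)
    assert clf.predict(X).shape == (20,)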
hcho3 authored Sep 6, 2018
1 parent 190d888 commit d176a0f
Showing 3 changed files with 34 additions and 4 deletions.
7 changes: 5 additions & 2 deletions python-package/xgboost/core.py
@@ -9,7 +9,6 @@
import os
import re
import sys

import numpy as np
import scipy.sparse

@@ -374,11 +373,15 @@ def __init__(self, data, label=None, missing=None,
         if label is not None:
             if isinstance(label, np.ndarray):
                 self.set_label_npy2d(label)
+            elif getattr(label, '__array__', None) is not None:
+                self.set_label_npy2d(label.__array__())
             else:
                 self.set_label(label)
         if weight is not None:
             if isinstance(weight, np.ndarray):
                 self.set_weight_npy2d(weight)
+            elif getattr(weight, '__array__', None) is not None:
+                self.set_weight_npy2d(weight.__array__())
             else:
                 self.set_weight(weight)
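
The new elif branches lean on the NumPy __array__ protocol; an illustration
(assuming pandas is installed) of the kind of label object this now routes
through set_label_npy2d():

    import numpy as np
    import pandas as pd

    # A pandas Series is not an ndarray, but it exposes __array__,
    # so DMatrix can convert it without an explicit np.asarray() call.
    y = pd.Series([0.0, 1.0, 1.0])
    assert getattr(y, '__array__', None) is not None
    assert isinstance(y.__array__(), np.ndarray)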

@@ -428,7 +431,7 @@ def _init_from_npy2d(self, mat, missing, nthread):
         and type if memory use is a concern.
         """
         if len(mat.shape) != 2:
-            raise ValueError('Input numpy.ndarray must be 2 dimensional')
+            raise ValueError('Input numpy.ndarray must be 2 dimensional. Reshape your data.')
         # flatten the array by rows and ensure it is float32.
         # we try to avoid data copies if possible (reshape returns a view when possible
         # and we explicitly tell np.array to try and avoid copying)
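
The reworded error mirrors scikit-learn's "Reshape your data" hint; a quick
illustration of the fix it points to (hypothetical data):

    import numpy as np
    import xgboost as xgb

    flat = np.arange(6, dtype=np.float32)
    # A 1-D array is rejected; reshape to (n_samples, n_features) first.
    dtrain = xgb.DMatrix(flat.reshape(3, 2), label=np.array([0.0, 1.0, 0.0]))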
19 changes: 17 additions & 2 deletions python-package/xgboost/sklearn.py
@@ -1,10 +1,12 @@
 # coding: utf-8
-# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912, C0302
 """Scikit-Learn Wrapper interface for XGBoost."""
 from __future__ import absolute_import

 import numpy as np
 import warnings
+from sklearn.exceptions import NotFittedError
+from sklearn.exceptions import DataConversionWarning
 from .core import Booster, DMatrix, XGBoostError
 from .training import train

@@ -14,6 +16,16 @@
                      XGBClassifierBase, XGBRegressorBase, XGBLabelEncoder)


+def _check_label_1d(label):
+    """Produce warning if label is not 1D array"""
+    label = np.array(label, copy=False, dtype=np.float32)
+    if len(label.shape) == 2 and label.shape[1] == 1:
+        warnings.warn('A column-vector y was passed when a 1d array was'
+                      ' expected. Please change the shape of y to '
+                      '(n_samples, ), for example using ravel().',
+                      DataConversionWarning, stacklevel=2)
+
+
 def _objective_decorator(func):
     """Decorate an objective function
@@ -178,7 +190,7 @@ def get_booster(self):
         booster : a xgboost booster of underlying model
         """
         if self._Booster is None:
-            raise XGBoostError('need to call fit or load_model beforehand')
+            raise NotFittedError('need to call fit or load_model beforehand')
         return self._Booster

     def get_params(self, deep=False):
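
Switching to NotFittedError matters because scikit-learn's estimator checks
expect that exception from an unfitted estimator; with this patch, the
failure mode looks like (illustrative):

    from sklearn.exceptions import NotFittedError
    from xgboost import XGBClassifier

    try:
        XGBClassifier().get_booster()  # no fit() or load_model() yet
    except NotFittedError as err:
        print('not fitted:', err)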
@@ -286,6 +298,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             file name of stored xgb model or 'Booster' instance Xgb model to be
             loaded before training (allows training continuation).
         """
+        _check_label_1d(label=y)
         if sample_weight is not None:
             trainDmatrix = DMatrix(X, label=y, weight=sample_weight,
                                    missing=self.missing, nthread=self.n_jobs)
@@ -536,6 +549,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             file name of stored xgb model or 'Booster' instance Xgb model to be
             loaded before training (allows training continuation).
         """
+        _check_label_1d(label=y)
         evals_result = {}
         self.classes_ = np.unique(y)
         self.n_classes_ = len(self.classes_)
@@ -912,6 +926,7 @@ def fit(self, X, y, group, sample_weight=None, eval_set=None, sample_weight_eval
             file name of stored xgb model or 'Booster' instance Xgb model to be
             loaded before training (allows training continuation).
         """
+        _check_label_1d(label=y)
         # check if group information is provided
         if group is None:
             raise ValueError("group is required for ranking task")
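
For context, a minimal XGBRanker call satisfying the group requirement
(hypothetical data; group sizes must sum to len(y)):

    import numpy as np
    from xgboost import XGBRanker

    X = np.random.rand(8, 3)
    y = np.random.randint(0, 3, size=8)   # relevance grades per document
    # Two query groups of four documents each.
    ranker = XGBRanker(n_estimators=2).fit(X, y, group=[4, 4])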
12 changes: 12 additions & 0 deletions python-package/xgboost/training.py
@@ -203,6 +203,18 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
                       DeprecationWarning)
         callbacks.append(callback.reset_learning_rate(learning_rates))

+    nrow = dtrain.num_row()
+    ncol = dtrain.num_col()
+    if nrow <= 0:
+        raise ValueError('{} row(s) (shape=({}, {})) while a minimum of 1 is required.'
+                         .format(nrow, nrow, ncol))
+    if ncol <= 0:
+        raise ValueError('{} feature(s) (shape=({}, {})) while a minimum of 1 is required.'
+                         .format(ncol, nrow, ncol))
+    label = dtrain.get_label()
+    if nrow != len(label):
+        raise ValueError('Label must have same length as the number of data rows')
+
     return _train_internal(params, dtrain,
                            num_boost_round=num_boost_round,
                            evals=evals,
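
With these checks, training on an empty matrix fails fast in train() with a
readable message; a sketch (assuming DMatrix construction itself accepts the
empty array):

    import numpy as np
    import xgboost as xgb

    empty = xgb.DMatrix(np.empty((0, 3)), label=np.empty(0))
    try:
        xgb.train({}, empty, num_boost_round=1)
    except ValueError as err:
        print(err)  # 0 row(s) (shape=(0, 3)) while a minimum of 1 is required.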
