Add scikit-learn tests (dmlc#3674)
* Add scikit-learn tests

Goal is to pass scikit-learn's check_estimator() for XGBClassifier,
XGBRegressor, and XGBRanker. It is not possible to do so entirely,
since check_estimator() assumes that NaN is disallowed, whereas
XGBoost accepts NaN as a missing value. Still, it is a good idea to
add some checks inspired by check_estimator(); a sketch of such a
check follows the commit message.

* Fix lint

* Fix lint
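
A minimal sketch of the kind of borrowed check this enables (illustrative
only, not part of the commit; dataset and parameters are made up):

    import numpy as np
    from xgboost import XGBClassifier

    # In the spirit of check_estimator(): fit on a tiny dataset and verify
    # that predict() honours scikit-learn's output-shape contract.
    X = np.random.rand(20, 3)
    y = np.array([0, 1] * 10)
    clf = XGBClassifier(n_estimators=2).fit(X, y)
    assert clf.predict(X).shape == (20,)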
hcho3 authored Sep 6, 2018
1 parent 190d888 commit d176a0f
Showing 3 changed files with 34 additions and 4 deletions.
7 changes: 5 additions & 2 deletions python-package/xgboost/core.py
@@ -9,7 +9,6 @@
import os
import re
import sys

import numpy as np
import scipy.sparse

@@ -374,11 +373,15 @@ def __init__(self, data, label=None, missing=None,
         if label is not None:
             if isinstance(label, np.ndarray):
                 self.set_label_npy2d(label)
+            elif getattr(label, '__array__', None) is not None:
+                self.set_label_npy2d(label.__array__())
             else:
                 self.set_label(label)
         if weight is not None:
             if isinstance(weight, np.ndarray):
                 self.set_weight_npy2d(weight)
+            elif getattr(weight, '__array__', None) is not None:
+                self.set_weight_npy2d(weight.__array__())
             else:
                 self.set_weight(weight)
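
The new elif branches lean on the NumPy __array__ protocol; an illustration
(assuming pandas is installed) of the kind of label object this now routes
through set_label_npy2d():

    import numpy as np
    import pandas as pd

    # A pandas Series is not an ndarray, but it exposes __array__,
    # so DMatrix can convert it without an explicit np.asarray() call.
    y = pd.Series([0.0, 1.0, 1.0])
    assert getattr(y, '__array__', None) is not None
    assert isinstance(y.__array__(), np.ndarray)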

@@ -428,7 +431,7 @@ def _init_from_npy2d(self, mat, missing, nthread):
         and type if memory use is a concern.
         """
         if len(mat.shape) != 2:
-            raise ValueError('Input numpy.ndarray must be 2 dimensional')
+            raise ValueError('Input numpy.ndarray must be 2 dimensional. Reshape your data.')
         # flatten the array by rows and ensure it is float32.
         # we try to avoid data copies if possible (reshape returns a view when possible
         # and we explicitly tell np.array to try and avoid copying)
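
The reworded error mirrors scikit-learn's "Reshape your data" hint; a quick
illustration of the fix it points to (hypothetical data):

    import numpy as np
    import xgboost as xgb

    flat = np.arange(6, dtype=np.float32)
    # A 1-D array is rejected; reshape to (n_samples, n_features) first.
    dtrain = xgb.DMatrix(flat.reshape(3, 2), label=np.array([0.0, 1.0, 0.0]))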
19 changes: 17 additions & 2 deletions python-package/xgboost/sklearn.py
@@ -1,10 +1,12 @@
 # coding: utf-8
-# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, E0012, R0912, C0302
 """Scikit-Learn Wrapper interface for XGBoost."""
 from __future__ import absolute_import

 import numpy as np
 import warnings
+from sklearn.exceptions import NotFittedError
+from sklearn.exceptions import DataConversionWarning
 from .core import Booster, DMatrix, XGBoostError
 from .training import train

@@ -14,6 +16,16 @@
                      XGBClassifierBase, XGBRegressorBase, XGBLabelEncoder)


+def _check_label_1d(label):
+    """Produce warning if label is not 1D array"""
+    label = np.array(label, copy=False, dtype=np.float32)
+    if len(label.shape) == 2 and label.shape[1] == 1:
+        warnings.warn('A column-vector y was passed when a 1d array was'
+                      ' expected. Please change the shape of y to '
+                      '(n_samples, ), for example using ravel().',
+                      DataConversionWarning, stacklevel=2)
+
+
 def _objective_decorator(func):
     """Decorate an objective function
@@ -178,7 +190,7 @@ def get_booster(self):
         booster : a xgboost booster of underlying model
         """
         if self._Booster is None:
-            raise XGBoostError('need to call fit or load_model beforehand')
+            raise NotFittedError('need to call fit or load_model beforehand')
         return self._Booster

     def get_params(self, deep=False):
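
Switching to NotFittedError matters because scikit-learn's estimator checks
expect that exception from an unfitted estimator; with this patch, the
failure mode looks like (illustrative):

    from sklearn.exceptions import NotFittedError
    from xgboost import XGBClassifier

    try:
        XGBClassifier().get_booster()  # no fit() or load_model() yet
    except NotFittedError as err:
        print('not fitted:', err)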
@@ -286,6 +298,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             file name of stored xgb model or 'Booster' instance Xgb model to be
             loaded before training (allows training continuation).
         """
+        _check_label_1d(label=y)
         if sample_weight is not None:
             trainDmatrix = DMatrix(X, label=y, weight=sample_weight,
                                    missing=self.missing, nthread=self.n_jobs)
@@ -536,6 +549,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             file name of stored xgb model or 'Booster' instance Xgb model to be
             loaded before training (allows training continuation).
         """
+        _check_label_1d(label=y)
         evals_result = {}
         self.classes_ = np.unique(y)
         self.n_classes_ = len(self.classes_)
@@ -912,6 +926,7 @@ def fit(self, X, y, group, sample_weight=None, eval_set=None, sample_weight_eval
             file name of stored xgb model or 'Booster' instance Xgb model to be
             loaded before training (allows training continuation).
         """
+        _check_label_1d(label=y)
         # check if group information is provided
         if group is None:
             raise ValueError("group is required for ranking task")
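
For context, a minimal XGBRanker call satisfying the group requirement
(hypothetical data; group sizes must sum to len(y)):

    import numpy as np
    from xgboost import XGBRanker

    X = np.random.rand(8, 3)
    y = np.random.randint(0, 3, size=8)   # relevance grades per document
    # Two query groups of four documents each.
    ranker = XGBRanker(n_estimators=2).fit(X, y, group=[4, 4])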
12 changes: 12 additions & 0 deletions python-package/xgboost/training.py
@@ -203,6 +203,18 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
                       DeprecationWarning)
         callbacks.append(callback.reset_learning_rate(learning_rates))

+    nrow = dtrain.num_row()
+    ncol = dtrain.num_col()
+    if nrow <= 0:
+        raise ValueError('{} row(s) (shape=({}, {})) while a minimum of 1 is required.'
+                         .format(nrow, nrow, ncol))
+    if ncol <= 0:
+        raise ValueError('{} feature(s) (shape=({}, {})) while a minimum of 1 is required.'
+                         .format(ncol, nrow, ncol))
+    label = dtrain.get_label()
+    if nrow != len(label):
+        raise ValueError('Label must have same length as the number of data rows')
+
     return _train_internal(params, dtrain,
                            num_boost_round=num_boost_round,
                            evals=evals,
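
With these checks, training on an empty matrix fails fast in train() with a
readable message; a sketch (assuming DMatrix construction itself accepts the
empty array):

    import numpy as np
    import xgboost as xgb

    empty = xgb.DMatrix(np.empty((0, 3)), label=np.empty(0))
    try:
        xgb.train({}, empty, num_boost_round=1)
    except ValueError as err:
        print(err)  # 0 row(s) (shape=(0, 3)) while a minimum of 1 is required.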
