diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index a9a8b0837dd9..ae468d7d789f 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -131,8 +131,14 @@ Basic functionalities including training and generating predictions for regressi classification are implemented. But there are still some other limitations we haven't addressed yet. -- Label encoding for Scikit-Learn classifier. -- Ranking +- Label encoding for Scikit-Learn classifier may not be supported. Meaning that user need + to encode their training labels into discrete values first. +- Ranking is not supported right now. +- Empty worker is not well supported by classifier. If the training hangs for classifier + with a warning about empty DMatrix, please consider balancing your data first. But + regressor works fine with empty DMatrix. - Callback functions are not tested. -- To use cross validation one needs to explicitly train different models instead of using - a functional API like ``xgboost.cv``. +- Only ``GridSearchCV`` from ``scikit-learn`` is supported for dask interface. Meaning + that we can distribute data among workers but have to train one model at a time. If you + want to scale up grid searching with model parallelism by ``dask-ml``, please consider + using normal ``scikit-learn`` interface like `xgboost.XGBRegressor` for now. diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index e943cc592c40..44002c34061e 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -572,7 +572,7 @@ def map_function(func): return predictions -def _evaluation_matrices(client, validation_set, sample_weights): +def _evaluation_matrices(client, validation_set, sample_weights, missing): ''' Parameters ---------- @@ -597,7 +597,8 @@ def _evaluation_matrices(client, validation_set, sample_weights): for i, e in enumerate(validation_set): w = (sample_weights[i] if sample_weights is not None else None) - dmat = DaskDMatrix(client=client, data=e[0], label=e[1], weight=w) + dmat = DaskDMatrix(client=client, data=e[0], label=e[1], weight=w, + missing=missing) evals.append((dmat, 'validation_{}'.format(i))) else: evals = None @@ -672,10 +673,12 @@ def fit(self, verbose=True): _assert_dask_support() dtrain = DaskDMatrix(client=self.client, - data=X, label=y, weight=sample_weights) + data=X, label=y, weight=sample_weights, + missing=self.missing) params = self.get_xgb_params() evals = _evaluation_matrices(self.client, - eval_set, sample_weight_eval_set) + eval_set, sample_weight_eval_set, + self.missing) results = train(self.client, params, dtrain, num_boost_round=self.get_num_boosting_rounds(), @@ -688,7 +691,8 @@ def fit(self, def predict(self, data): # pylint: disable=arguments-differ _assert_dask_support() - test_dmatrix = DaskDMatrix(client=self.client, data=data) + test_dmatrix = DaskDMatrix(client=self.client, data=data, + missing=self.missing) pred_probs = predict(client=self.client, model=self.get_booster(), data=test_dmatrix) return pred_probs @@ -711,7 +715,8 @@ def fit(self, verbose=True): _assert_dask_support() dtrain = DaskDMatrix(client=self.client, - data=X, label=y, weight=sample_weights) + data=X, label=y, weight=sample_weights, + missing=self.missing) params = self.get_xgb_params() # pylint: disable=attribute-defined-outside-init @@ -728,7 +733,8 @@ def fit(self, params["objective"] = "binary:logistic" evals = _evaluation_matrices(self.client, - eval_set, sample_weight_eval_set) + eval_set, sample_weight_eval_set, + self.missing) results = train(self.client, params, dtrain, num_boost_round=self.get_num_boosting_rounds(), evals=evals, verbose_eval=verbose) @@ -739,7 +745,8 @@ def fit(self, def predict(self, data): # pylint: disable=arguments-differ _assert_dask_support() - test_dmatrix = DaskDMatrix(client=self.client, data=data) + test_dmatrix = DaskDMatrix(client=self.client, data=data, + missing=self.missing) pred_probs = predict(client=self.client, model=self.get_booster(), data=test_dmatrix) return pred_probs diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 896f1881d58b..b8d7c9a336db 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -97,6 +97,58 @@ def test_from_dask_array(): assert np.all(single_node_predt == from_arr.compute()) +def test_dask_missing_value_reg(): + with LocalCluster(n_workers=5) as cluster: + with Client(cluster) as client: + X_0 = np.ones((20 // 2, kCols)) + X_1 = np.zeros((20 // 2, kCols)) + X = np.concatenate([X_0, X_1], axis=0) + np.random.shuffle(X) + X = da.from_array(X) + X = X.rechunk(20, 1) + y = da.random.randint(0, 3, size=20) + y.rechunk(20) + regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2, + missing=0.0) + regressor.client = client + regressor.set_params(tree_method='hist') + regressor.fit(X, y, eval_set=[(X, y)]) + dd_predt = regressor.predict(X).compute() + + np_X = X.compute() + np_predt = regressor.get_booster().predict( + xgb.DMatrix(np_X, missing=0.0)) + np.testing.assert_allclose(np_predt, dd_predt) + + +def test_dask_missing_value_cls(): + # Multi-class doesn't handle empty DMatrix well. So we use lesser workers. + with LocalCluster(n_workers=2) as cluster: + with Client(cluster) as client: + X_0 = np.ones((kRows // 2, kCols)) + X_1 = np.zeros((kRows // 2, kCols)) + X = np.concatenate([X_0, X_1], axis=0) + np.random.shuffle(X) + X = da.from_array(X) + X = X.rechunk(20, None) + y = da.random.randint(0, 3, size=kRows) + y = y.rechunk(20, 1) + cls = xgb.dask.DaskXGBClassifier(verbosity=1, n_estimators=2, + tree_method='hist', + missing=0.0) + cls.client = client + cls.fit(X, y, eval_set=[(X, y)]) + dd_predt = cls.predict(X).compute() + + np_X = X.compute() + np_predt = cls.get_booster().predict( + xgb.DMatrix(np_X, missing=0.0)) + np.testing.assert_allclose(np_predt, dd_predt) + + cls = xgb.dask.DaskXGBClassifier() + assert hasattr(cls, 'missing') + + def test_dask_regressor(): with LocalCluster(n_workers=5) as cluster: with Client(cluster) as client: