diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index dbc0da41daa2..f429023a77c4 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -213,3 +213,60 @@ jobs:
       shell: bash -l {0}
       run: |
         pytest -s -v -rxXs --durations=0 ./tests/python
+
+  python-tests-on-ubuntu:
+    name: Test XGBoost Python package on ${{ matrix.config.os }}
+    runs-on: ${{ matrix.config.os }}
+    timeout-minutes: 90
+    strategy:
+      matrix:
+        config:
+          - {os: ubuntu-latest, python-version: "3.8"}
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+
+    - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f  # v14
+      with:
+        cache-downloads: true
+        cache-env: true
+        environment-name: linux_cpu_test
+        environment-file: tests/ci_build/conda_env/linux_cpu_test.yml
+
+    - name: Display Conda env
+      shell: bash -l {0}
+      run: |
+        conda info
+        conda list
+
+    - name: Build XGBoost on Ubuntu
+      shell: bash -l {0}
+      run: |
+        mkdir build
+        cd build
+        cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
+        ninja
+
+    - name: Install Python package
+      shell: bash -l {0}
+      run: |
+        cd python-package
+        python --version
+        python setup.py install
+
+    - name: Test Python package
+      shell: bash -l {0}
+      run: |
+        pytest -s -v -rxXs --durations=0 ./tests/python
+
+    - name: Test Dask Interface
+      shell: bash -l {0}
+      run: |
+        pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask
+
+    - name: Test PySpark Interface
+      shell: bash -l {0}
+      run: |
+        pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index 6540ef0b372d..e6fe14c6be66 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -320,7 +320,7 @@ test_that("prediction in early-stopping xgb.cv works", {
   expect_output(
     cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20,
                  early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE,
-                 prediction = TRUE)
+                 prediction = TRUE, base_score = 0.5)
   , "Stopping. Best iteration")
Best iteration") expect_false(is.null(cv$best_iteration)) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 01436f48af30..7315a2f87151 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -27,11 +27,13 @@ if (isTRUE(VCD_AVAILABLE)) { # binary bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9, eta = 1, nthread = 2, nrounds = nrounds, verbose = 0, - objective = "binary:logistic", booster = "gbtree") + objective = "binary:logistic", booster = "gbtree", + base_score = 0.5) bst.GLM <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 1, nrounds = nrounds, verbose = 0, - objective = "binary:logistic", booster = "gblinear") + objective = "binary:logistic", booster = "gblinear", + base_score = 0.5) feature.names <- colnames(sparse_matrix) } @@ -360,7 +362,8 @@ test_that("xgb.importance works with and without feature names", { m <- xgboost::xgboost( data = as.matrix(data.frame(x = c(0, 1))), label = c(1, 2), - nrounds = 1 + nrounds = 1, + base_score = 0.5 ) df <- xgb.model.dt.tree(model = m) expect_equal(df$Feature, "Leaf") diff --git a/demo/guide-python/feature_weights.py b/demo/guide-python/feature_weights.py index 6e761d300bd1..b12edb9415ec 100644 --- a/demo/guide-python/feature_weights.py +++ b/demo/guide-python/feature_weights.py @@ -1,9 +1,9 @@ -''' +""" Demo for using feature weight to change column sampling ======================================================= .. versionadded:: 1.3.0 -''' +""" import argparse @@ -13,10 +13,10 @@ import xgboost -def main(args): +def main(args: argparse.Namespace) -> None: rng = np.random.RandomState(1994) - kRows = 1000 + kRows = 4196 kCols = 10 X = rng.randn(kRows, kCols) @@ -28,26 +28,32 @@ def main(args): dtrain = xgboost.DMatrix(X, y) dtrain.set_info(feature_weights=fw) - bst = xgboost.train({'tree_method': 'hist', - 'colsample_bynode': 0.2}, - dtrain, num_boost_round=10, - evals=[(dtrain, 'd')]) + # Perform column sampling for each node split evaluation, the sampling process is + # weighted by feature weights. 
+    bst = xgboost.train(
+        {"tree_method": "hist", "colsample_bynode": 0.2},
+        dtrain,
+        num_boost_round=10,
+        evals=[(dtrain, "d")],
+    )
     feature_map = bst.get_fscore()
+
     # feature zero has 0 weight
-    assert feature_map.get('f0', None) is None
-    assert max(feature_map.values()) == feature_map.get('f9')
+    assert feature_map.get("f0", None) is None
+    assert max(feature_map.values()) == feature_map.get("f9")
 
     if args.plot:
         xgboost.plot_importance(bst)
         plt.show()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--plot',
+        "--plot",
         type=int,
         default=1,
-        help='Set to 0 to disable plotting the evaluation history.')
+        help="Set to 0 to disable plotting the feature importance.",
+    )
     args = parser.parse_args()
     main(args)
diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py
index b0fc49d81e5e..2ebefffc767f 100644
--- a/demo/guide-python/sklearn_parallel.py
+++ b/demo/guide-python/sklearn_parallel.py
@@ -12,10 +12,15 @@
 if __name__ == "__main__":
     print("Parallel Parameter optimization")
     X, y = fetch_california_housing(return_X_y=True)
-    xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
-    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
-                                   'n_estimators': [50, 100, 200]}, verbose=1,
-                       n_jobs=2)
+    xgb_model = xgb.XGBRegressor(
+        n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"
+    )
+    clf = GridSearchCV(
+        xgb_model,
+        {"max_depth": [2, 4, 6], "n_estimators": [50, 100, 200]},
+        verbose=1,
+        n_jobs=2,
+    )
     clf.fit(X, y)
     print(clf.best_score_)
     print(clf.best_params_)
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
index 00cc4d5750e2..f31207b9fb7e 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -261,10 +261,10 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
       "eta" -> "1",
       "max_depth" -> "6",
       "silent" -> "1",
+      "base_score" -> 0.5,
       "objective" -> "binary:logistic",
       "tree_method" -> treeMethod,
       "max_bin" -> 16)
-
     val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
     val prediction1 = model1.predict(testDM)
 
@@ -453,5 +453,4 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
     assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath,
       nativeUbjModelPath))
   }
-
 }
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 5ca9770a431b..ad19339ae90f 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -1078,7 +1078,7 @@ def predict(
         validate_features: bool = True,
         base_margin: Optional[ArrayLike] = None,
         iteration_range: Optional[Tuple[int, int]] = None,
-    ) -> np.ndarray:
+    ) -> ArrayLike:
         """Predict with `X`.  If the model is trained with early stopping, then
         `best_iteration` is used automatically.
         For tree models, when data is on GPU, like cupy array or cuDF
         dataframe and `predictor` is not specified, the prediction is run on GPU
@@ -1528,7 +1528,7 @@ def predict(
         validate_features: bool = True,
         base_margin: Optional[ArrayLike] = None,
         iteration_range: Optional[Tuple[int, int]] = None,
-    ) -> np.ndarray:
+    ) -> ArrayLike:
         with config_context(verbosity=self.verbosity):
             class_probs = super().predict(
                 X=X,
diff --git a/python-package/xgboost/testing/dask.py b/python-package/xgboost/testing/dask.py
new file mode 100644
index 000000000000..e1f7142943fa
--- /dev/null
+++ b/python-package/xgboost/testing/dask.py
@@ -0,0 +1,54 @@
+"""Tests for dask shared by different test modules."""
+import numpy as np
+from dask import array as da
+from distributed import Client
+from xgboost.testing.updater import get_basescore
+
+import xgboost as xgb
+
+
+def check_init_estimation_clf(tree_method: str, client: Client) -> None:
+    """Test init estimation for classifier."""
+    from sklearn.datasets import make_classification
+
+    X, y = make_classification(n_samples=4096 * 2, n_features=32, random_state=1994)
+    clf = xgb.XGBClassifier(n_estimators=1, max_depth=1, tree_method=tree_method)
+    clf.fit(X, y)
+    base_score = get_basescore(clf)
+
+    dx = da.from_array(X).rechunk(chunks=(32, None))
+    dy = da.from_array(y).rechunk(chunks=(32,))
+    dclf = xgb.dask.DaskXGBClassifier(
+        n_estimators=1, max_depth=1, tree_method=tree_method
+    )
+    dclf.client = client
+    dclf.fit(dx, dy)
+    dbase_score = get_basescore(dclf)
+    np.testing.assert_allclose(base_score, dbase_score)
+
+
+def check_init_estimation_reg(tree_method: str, client: Client) -> None:
+    """Test init estimation for regressor."""
+    from sklearn.datasets import make_regression
+
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(n_samples=4096 * 2, n_features=32, random_state=1994)
+    reg = xgb.XGBRegressor(n_estimators=1, max_depth=1, tree_method=tree_method)
+    reg.fit(X, y)
+    base_score = get_basescore(reg)
+
+    dx = da.from_array(X).rechunk(chunks=(32, None))
+    dy = da.from_array(y).rechunk(chunks=(32,))
+    dreg = xgb.dask.DaskXGBRegressor(
+        n_estimators=1, max_depth=1, tree_method=tree_method
+    )
+    dreg.client = client
+    dreg.fit(dx, dy)
+    dbase_score = get_basescore(dreg)
+    np.testing.assert_allclose(base_score, dbase_score)
+
+
+def check_init_estimation(tree_method: str, client: Client) -> None:
+    """Test init estimation."""
+    check_init_estimation_reg(tree_method, client)
+    check_init_estimation_clf(tree_method, client)
diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py
new file mode 100644
index 000000000000..1b675e51f887
--- /dev/null
+++ b/python-package/xgboost/testing/updater.py
@@ -0,0 +1,70 @@
+"""Tests for updaters."""
+import json
+
+import numpy as np
+
+import xgboost as xgb
+
+
+def get_basescore(model: xgb.XGBModel) -> float:
+    """Get base score from an XGBoost sklearn estimator."""
+    base_score = float(
+        json.loads(model.get_booster().save_config())["learner"]["learner_model_param"][
+            "base_score"
+        ]
+    )
+    return base_score
+
+
+def check_init_estimation(tree_method: str) -> None:
+    """Test for init estimation."""
+    from sklearn.datasets import (
+        make_classification,
+        make_multilabel_classification,
+        make_regression,
+    )
+
+    def run_reg(X: np.ndarray, y: np.ndarray) -> None:  # pylint: disable=invalid-name
+        reg = xgb.XGBRegressor(tree_method=tree_method, max_depth=1, n_estimators=1)
+        reg.fit(X, y, eval_set=[(X, y)])
+        base_score_0 = get_basescore(reg)
+        score_0 = reg.evals_result()["validation_0"]["rmse"][0]
+
+        reg = xgb.XGBRegressor(
+            tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
+        )
+        reg.fit(X, y, eval_set=[(X, y)])
+        base_score_1 = get_basescore(reg)
+        score_1 = reg.evals_result()["validation_0"]["rmse"][0]
+        assert not np.isclose(base_score_0, base_score_1)
+        assert score_0 < score_1  # should be better
+
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(n_samples=4096, random_state=17)
+    run_reg(X, y)
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(n_samples=4096, n_targets=3, random_state=17)
+    run_reg(X, y)
+
+    def run_clf(X: np.ndarray, y: np.ndarray) -> None:  # pylint: disable=invalid-name
+        clf = xgb.XGBClassifier(tree_method=tree_method, max_depth=1, n_estimators=1)
+        clf.fit(X, y, eval_set=[(X, y)])
+        base_score_0 = get_basescore(clf)
+        score_0 = clf.evals_result()["validation_0"]["logloss"][0]
+
+        clf = xgb.XGBClassifier(
+            tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
+        base_score_1 = get_basescore(clf)
+        score_1 = clf.evals_result()["validation_0"]["logloss"][0]
+        assert not np.isclose(base_score_0, base_score_1)
+        assert score_0 < score_1  # should be better
+
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_classification(n_samples=4096, random_state=17)
+    run_clf(X, y)
+    X, y = make_multilabel_classification(
+        n_samples=4096, n_labels=3, n_classes=5, random_state=17
+    )
+    run_clf(X, y)
diff --git a/src/collective/rabit_communicator.h b/src/collective/rabit_communicator.h
index 712b76eff4da..19004afb7ea9 100644
--- a/src/collective/rabit_communicator.h
+++ b/src/collective/rabit_communicator.h
@@ -119,7 +119,7 @@ class RabitCommunicator : public Communicator {
   }
 
   template <typename DType, std::enable_if_t<std::is_floating_point<DType>::value> * = nullptr>
-  void DoBitwiseAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
+  void DoBitwiseAllReduce(void *, std::size_t, Operation) {
     LOG(FATAL) << "Floating point types do not support bitwise operations.";
   }
diff --git a/src/data/data.cc b/src/data/data.cc
index d4e00cfd7bc3..dd1f5171784f 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -684,7 +684,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
   }
 }
 
-void MetaInfo::Validate(int32_t device) const {
+void MetaInfo::Validate(std::int32_t device) const {
   if (group_ptr_.size() != 0 && weights_.Size() != 0) {
     CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
         << "Size of weights must equal to number of groups when ranking "
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 73a17e359ab9..b914256af552 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -15,7 +15,7 @@
 #include "../common/hist_util.h"
 #include "../common/numeric.h"
 #include "../common/threading_utils.h"
-#include "../common/transform_iterator.h"  // MakeIndexTransformIter
+#include "../common/transform_iterator.h"  // common::MakeIndexTransformIter
 #include "adapter.h"
 #include "proxy_dmatrix.h"
 #include "xgboost/base.h"
diff --git a/src/learner.cc b/src/learner.cc
index 2462aec2397f..08b3d9733ef5 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -190,6 +190,32 @@ struct LearnerModelParamLegacy : public dmlc::Parameter<LearnerModelParamLegacy>
     }
     return dmlc::Parameter<LearnerModelParamLegacy>::UpdateAllowUnknown(kwargs);
   }
+  // sanity check
+  void Validate() {
+    if (!collective::IsDistributed()) {
+      return;
+    }
+
+    std::array<std::int32_t, 6> data;
+    std::size_t pos{0};
+    std::memcpy(data.data() + pos, &base_score, sizeof(base_score));
+    pos += 1;
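+    // The remaining fields follow the same pattern, one fixed-width slot each;
+    // rank 0 then broadcasts its copy and every worker compares the result
+    // against its local values.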
+    std::memcpy(data.data() + pos, &num_feature, sizeof(num_feature));
+    pos += 1;
+    std::memcpy(data.data() + pos, &num_class, sizeof(num_class));
+    pos += 1;
+    std::memcpy(data.data() + pos, &num_target, sizeof(num_target));
+    pos += 1;
+    std::memcpy(data.data() + pos, &major_version, sizeof(major_version));
+    pos += 1;
+    std::memcpy(data.data() + pos, &minor_version, sizeof(minor_version));
+
+    std::array<std::int32_t, 6> sync;
+    std::copy(data.cbegin(), data.cend(), sync.begin());
+    collective::Broadcast(sync.data(), sync.size(), 0);
+    CHECK(std::equal(data.cbegin(), data.cend(), sync.cbegin()))
+        << "Different model parameter across workers.";
+  }
 
   // declare parameters
   DMLC_DECLARE_PARAMETER(LearnerModelParamLegacy) {
@@ -391,6 +417,7 @@ class LearnerConfiguration : public Learner {
       }
       // Update the shared model parameter
       this->ConfigureModelParamWithoutBaseScore();
+      mparam_.Validate();
     }
     CHECK(!std::isnan(mparam_.base_score));
     CHECK(!std::isinf(mparam_.base_score));
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index f609745cfc31..8e5b4fb54662 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -20,6 +20,7 @@
 #include "../common/stats.h"
 #include "../common/threading_utils.h"
 #include "../common/transform.h"
+#include "../tree/fit_stump.h"  // FitStump
 #include "./regression_loss.h"
 #include "adaptive.h"
 #include "xgboost/base.h"
@@ -53,6 +54,31 @@ void CheckRegInputs(MetaInfo const& info, HostDeviceVector<float> const& pre
   }
 }  // anonymous namespace
 
+class RegInitEstimation : public ObjFunction {
+  void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) const override {
+    CheckInitInputs(info);
+    // Avoid altering any state in child objective.
+    HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
+    HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
+
+    Json config{Object{}};
+    this->SaveConfig(&config);
+
+    std::unique_ptr<ObjFunction> new_obj{
+        ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
+    new_obj->LoadConfig(config);
+    new_obj->GetGradient(dummy_predt, info, 0, &gpair);
+    bst_target_t n_targets = this->Targets(info);
+    linalg::Vector<float> leaf_weight;
+    tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
+
+    // workaround, we don't support multi-target due to binary model serialization for
+    // base margin.
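+    // Averaging the per-target stump weights yields a single scalar;
+    // PredTransform then maps it into the output space (e.g. through the
+    // sigmoid for logistic losses).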
+    common::Mean(this->ctx_, leaf_weight, base_score);
+    this->PredTransform(base_score->Data());
+  }
+};
+
 #if defined(XGBOOST_USE_CUDA)
 DMLC_REGISTRY_FILE_TAG(regression_obj_gpu);
 #endif  // defined(XGBOOST_USE_CUDA)
@@ -67,7 +93,7 @@ struct RegLossParam : public XGBoostParameter<RegLossParam> {
 };
 
 template <typename Loss>
-class RegLossObj : public ObjFunction {
+class RegLossObj : public RegInitEstimation {
  protected:
   HostDeviceVector<float> additional_input_;
@@ -214,7 +240,7 @@ XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
       return new RegLossObj<LinearSquareLoss>();
     });
 // End deprecated
-class PseudoHuberRegression : public ObjFunction {
+class PseudoHuberRegression : public RegInitEstimation {
   PesudoHuberParam param_;
 
  public:
@@ -289,7 +315,7 @@ struct PoissonRegressionParam : public XGBoostParameter<PoissonRegressionParam>
 };
 
 // poisson regression for count
-class PoissonRegression : public ObjFunction {
+class PoissonRegression : public RegInitEstimation {
  public:
   // declare functions
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -384,7 +410,7 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
 
 // cox regression for survival data (negative values mean they are censored)
-class CoxRegression : public ObjFunction {
+class CoxRegression : public RegInitEstimation {
  public:
   void Configure(Args const&) override {}
   ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -481,7 +507,7 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
     .set_body([]() { return new CoxRegression(); });
 
 // gamma regression
-class GammaRegression : public ObjFunction {
+class GammaRegression : public RegInitEstimation {
  public:
   void Configure(Args const&) override {}
   ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -572,7 +598,7 @@ struct TweedieRegressionParam : public XGBoostParameter<TweedieRegressionParam>
 };
 
 // tweedie regression
-class TweedieRegression : public ObjFunction {
+class TweedieRegression : public RegInitEstimation {
  public:
   // declare functions
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu
index 54c3c3af4ec7..d09250b045c3 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -36,10 +36,10 @@ RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
     rm -rf grpc
 
 # Create new Conda environment
-COPY conda_env/cpu_test.yml /scripts/
-RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml && \
+COPY conda_env/linux_cpu_test.yml /scripts/
+RUN mamba env create -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \
     mamba clean --all && \
-    conda run --no-capture-output -n cpu_test pip install buildkite-test-collector
+    conda run --no-capture-output -n linux_cpu_test pip install buildkite-test-collector
 
 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
diff --git a/tests/ci_build/conda_env/cpu_test.yml b/tests/ci_build/conda_env/linux_cpu_test.yml
similarity index 87%
rename from tests/ci_build/conda_env/cpu_test.yml
rename to tests/ci_build/conda_env/linux_cpu_test.yml
index eff76cd8c17d..222c51f187ad 100644
--- a/tests/ci_build/conda_env/cpu_test.yml
+++ b/tests/ci_build/conda_env/linux_cpu_test.yml
@@ -1,8 +1,12 @@
-name: cpu_test
+name: linux_cpu_test
 channels:
 - conda-forge
 dependencies:
 - python=3.8
+- cmake
+- c-compiler
+- cxx-compiler
+- ninja
 - pip
 - wheel
 - pyyaml
@@ -33,7 +37,7 @@ dependencies:
 - pyarrow
 - protobuf
 - cloudpickle
-- shap
+- shap>=0.41
 - modin
 - pip:
   - datatable
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index cd553de48ba5..894c48bdfba7 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -146,13 +146,17 @@ def main(args: argparse.Namespace) -> None:
             "tests/python/test_data_iterator.py",
             "tests/python/test_dt.py",
             "tests/python/test_quantile_dmatrix.py",
+            "tests/python/test_tree_regularization.py",
             "tests/python-gpu/test_gpu_data_iterator.py",
+            "tests/ci_build/lint_python.py",
             "tests/test_distributed/test_with_spark/",
             "tests/test_distributed/test_gpu_with_spark/",
             # demo
             "demo/json-model/json_parser.py",
             "demo/guide-python/cat_in_the_dat.py",
             "demo/guide-python/categorical.py",
+            "demo/guide-python/feature_weights.py",
+            "demo/guide-python/sklearn_parallel.py",
             "demo/guide-python/spark_estimator_examples.py",
             # CI
             "tests/ci_build/lint_python.py",
@@ -194,6 +198,7 @@ def main(args: argparse.Namespace) -> None:
             "demo/json-model/json_parser.py",
             "demo/guide-python/external_memory.py",
             "demo/guide-python/cat_in_the_dat.py",
+            "demo/guide-python/feature_weights.py",
             # tests
             "tests/python/test_dt.py",
             "tests/python/test_data_iterator.py",
diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh
index 98b73d789b84..7375b4c9f872 100755
--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@@ -76,7 +76,7 @@ case "$suite" in
     ;;
 
   cpu)
-    source activate cpu_test
+    source activate linux_cpu_test
     set -x
     install_xgboost
     export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py
index 77592747e4ed..3fe647513078 100644
--- a/tests/python-gpu/test_from_cupy.py
+++ b/tests/python-gpu/test_from_cupy.py
@@ -224,5 +224,6 @@ def test_specified_device(self):
         dtrain = dmatrix_from_cupy(
             np.float32, xgb.DeviceQuantileDMatrix, np.nan)
         with pytest.raises(xgb.core.XGBoostError):
-            xgb.train({'tree_method': 'gpu_hist', 'gpu_id': 1},
-                      dtrain, num_boost_round=10)
+            xgb.train(
+                {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10
+            )
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 8a2501eb8ba5..571c4a17165e 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -5,6 +5,7 @@
 import pytest
 from hypothesis import assume, given, note, settings, strategies
 from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy
+from xgboost.testing.updater import check_init_estimation
 
 import xgboost as xgb
 from xgboost import testing as tm
@@ -172,24 +173,25 @@ def test_empty_dmatrix_prediction(self):
         kCols = 100
 
         X = np.empty((kRows, kCols))
-        y = np.empty((kRows))
+        y = np.empty((kRows,))
 
         dtrain = xgb.DMatrix(X, y)
 
-        bst = xgb.train({'verbosity': 2,
-                         'tree_method': 'gpu_hist',
-                         'gpu_id': 0},
-                        dtrain,
-                        verbose_eval=True,
-                        num_boost_round=6,
-                        evals=[(dtrain, 'Train')])
+        bst = xgb.train(
+            {"verbosity": 2, "tree_method": "gpu_hist", "gpu_id": 0},
+            dtrain,
+            verbose_eval=True,
+            num_boost_round=6,
+            evals=[(dtrain, 'Train')]
+        )
 
         kRows = 100
         X = np.random.randn(kRows, kCols)
 
         dtest = xgb.DMatrix(X)
         predictions = bst.predict(dtest)
-        np.testing.assert_allclose(predictions, 0.5, 1e-6)
+        # non-distributed, 0.0 is returned due to base_score estimation with 0 gradient.
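+        # (An empty training matrix yields no gradient statistics, so the fitted
+        # intercept, and hence every prediction, is the neutral value 0.0.)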
+        np.testing.assert_allclose(predictions, 0.0, 1e-6)
 
     @pytest.mark.mgpu
     @given(tm.dataset_strategy, strategies.integers(0, 10))
@@ -204,3 +206,6 @@ def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
     @pytest.mark.parametrize("weighted", [True, False])
     def test_adaptive(self, weighted) -> None:
         self.cputest.run_adaptive("gpu_hist", weighted)
+
+    def test_init_estimation(self) -> None:
+        check_init_estimation("gpu_hist")
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
index ab1aebc77756..000d5e347edc 100644
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from xgboost.testing.updater import get_basescore
 
 import xgboost as xgb
 from xgboost import testing as tm
@@ -11,16 +12,12 @@ class TestEarlyStopping:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_early_stopping_nonparallel(self):
         from sklearn.datasets import load_digits
-        try:
-            from sklearn.model_selection import train_test_split
-        except ImportError:
-            from sklearn.cross_validation import train_test_split
+        from sklearn.model_selection import train_test_split
 
         digits = load_digits(n_class=2)
         X = digits['data']
         y = digits['target']
-        X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                            random_state=0)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
         clf1 = xgb.XGBClassifier(learning_rate=0.1)
         clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
                  eval_set=[(X_test, y_test)])
@@ -31,9 +28,23 @@ def test_early_stopping_nonparallel(self):
         assert clf1.best_score == clf2.best_score
         assert clf1.best_score != 1  # check overfit
 
-        clf3 = xgb.XGBClassifier(learning_rate=0.1)
-        clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
-                 eval_set=[(X_test, y_test)])
+        clf3 = xgb.XGBClassifier(
+            learning_rate=0.1,
+            eval_metric="auc",
+            early_stopping_rounds=10
+        )
+        clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        base_score = get_basescore(clf3)
+        assert 0.53 > base_score > 0.5
+
+        clf3 = xgb.XGBClassifier(
+            learning_rate=0.1,
+            base_score=.5,
+            eval_metric="auc",
+            early_stopping_rounds=10
+        )
+        clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        assert clf3.best_score == 1
 
     def evalerror(self, preds, dtrain):
diff --git a/tests/python/test_tree_regularization.py b/tests/python/test_tree_regularization.py
index ae8e539a092a..c5bace3b61bb 100644
--- a/tests/python/test_tree_regularization.py
+++ b/tests/python/test_tree_regularization.py
@@ -9,11 +9,13 @@ class TestTreeRegularization:
     def test_alpha(self):
         params = {
-            'tree_method': 'exact', 'verbosity': 0,
-            'objective': 'reg:squarederror',
-            'eta': 1,
-            'lambda': 0,
-            'alpha': 0.1
+            "tree_method": "exact",
+            "verbosity": 0,
+            "objective": "reg:squarederror",
+            "eta": 1,
+            "lambda": 0,
+            "alpha": 0.1,
+            "base_score": 0.5,
         }
 
         model = xgb.train(params, train_data, 1)
@@ -27,11 +29,13 @@ def test_alpha(self):
     def test_lambda(self):
         params = {
-            'tree_method': 'exact', 'verbosity': 0,
-            'objective': 'reg:squarederror',
-            'eta': 1,
-            'lambda': 1,
-            'alpha': 0
+            "tree_method": "exact",
+            "verbosity": 0,
+            "objective": "reg:squarederror",
+            "eta": 1,
+            "lambda": 1,
+            "alpha": 0,
+            "base_score": 0.5,
         }
 
         model = xgb.train(params, train_data, 1)
@@ -45,11 +49,13 @@ def test_lambda(self):
     def test_alpha_and_lambda(self):
         params = {
-            'tree_method': 'exact', 'verbosity': 1,
-            'objective': 'reg:squarederror',
-            'eta': 1,
-            'lambda': 1,
-            'alpha': 0.1
+            "tree_method": "exact",
+            "verbosity": 1,
"objective": "reg:squarederror", + "eta": 1, + "lambda": 1, + "alpha": 0.1, + "base_score": 0.5, } model = xgb.train(params, train_data, 1) diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 8862a7d02145..6b34a78c1d78 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -10,6 +10,7 @@ exact_parameter_strategy, hist_parameter_strategy, ) +from xgboost.testing.updater import check_init_estimation import xgboost as xgb from xgboost import testing as tm @@ -449,3 +450,6 @@ def get_score(config: Dict) -> float: ) def test_adaptive(self, tree_method, weighted) -> None: self.run_adaptive(tree_method, weighted) + + def test_init_estimation(self) -> None: + check_init_estimation("hist") diff --git a/tests/python/test_with_shap.py b/tests/python/test_with_shap.py index 4a38193c1780..eab98f4878ce 100644 --- a/tests/python/test_with_shap.py +++ b/tests/python/test_with_shap.py @@ -9,6 +9,7 @@ shap = None pass + pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package") @@ -16,11 +17,16 @@ # Changes in binary format may cause problems def test_with_shap(): from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True) dtrain = xgb.DMatrix(X, label=y) model = xgb.train({"learning_rate": 0.01}, dtrain, 10) explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(X) margin = model.predict(dtrain, output_margin=True) - assert np.allclose(np.sum(shap_values, axis=len(shap_values.shape) - 1), - margin - explainer.expected_value, 1e-3, 1e-3) + assert np.allclose( + np.sum(shap_values, axis=len(shap_values.shape) - 1), + margin - explainer.expected_value, + 1e-3, + 1e-3, + ) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index f7f4e1f32c8b..8dfb5cab2267 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -9,6 +9,7 @@ import pytest from sklearn.utils.estimator_checks import parametrize_with_checks from xgboost.testing.shared import get_feature_weights, validate_data_initialization +from xgboost.testing.updater import get_basescore import xgboost as xgb from xgboost import testing as tm @@ -196,19 +197,22 @@ def test_stacking_classification(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) clf.fit(X_train, y_train).score(X_test, y_test) - @pytest.mark.skipif(**tm.no_pandas()) def test_feature_importances_weight(): from sklearn.datasets import load_digits digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] + y = digits["target"] + X = digits["data"] + + xgb_model = xgb.XGBClassifier( + random_state=0, + tree_method="exact", + learning_rate=0.1, + importance_type="weight", + base_score=0.5, + ).fit(X, y) - xgb_model = xgb.XGBClassifier(random_state=0, - tree_method="exact", - learning_rate=0.1, - importance_type="weight").fit(X, y) exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0., 0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0., @@ -223,16 +227,22 @@ def test_feature_importances_weight(): import pandas as pd y = pd.Series(digits['target']) X = pd.DataFrame(digits['data']) - xgb_model = xgb.XGBClassifier(random_state=0, - tree_method="exact", - learning_rate=0.1, - importance_type="weight").fit(X, y) + xgb_model = xgb.XGBClassifier( + random_state=0, + tree_method="exact", + learning_rate=0.1, + base_score=.5, + importance_type="weight" + ).fit(X, y) 
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
-    xgb_model = xgb.XGBClassifier(random_state=0,
-                                  tree_method="exact",
-                                  learning_rate=0.1,
-                                  importance_type="weight").fit(X, y)
+    xgb_model = xgb.XGBClassifier(
+        random_state=0,
+        tree_method="exact",
+        learning_rate=0.1,
+        importance_type="weight",
+        base_score=.5,
+    ).fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
     with pytest.raises(ValueError):
@@ -274,6 +284,7 @@ def test_feature_importances_gain():
         random_state=0, tree_method="exact",
         learning_rate=0.1,
         importance_type="gain",
+        base_score=0.5,
     ).fit(X, y)
 
     exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
@@ -296,6 +307,7 @@ def test_feature_importances_gain():
         tree_method="exact",
         learning_rate=0.1,
         importance_type="gain",
+        base_score=0.5,
     ).fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
@@ -304,6 +316,7 @@ def test_feature_importances_gain():
         tree_method="exact",
         learning_rate=0.1,
         importance_type="gain",
+        base_score=0.5,
     ).fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
@@ -593,18 +606,21 @@ def test_split_value_histograms():
     digits_2class = load_digits(n_class=2)
 
-    X = digits_2class['data']
-    y = digits_2class['target']
+    X = digits_2class["data"]
+    y = digits_2class["target"]
 
     dm = xgb.DMatrix(X, label=y)
-    params = {'max_depth': 6, 'eta': 0.01, 'verbosity': 0,
-              'objective': 'binary:logistic'}
+    params = {
+        "max_depth": 6,
+        "eta": 0.01,
+        "verbosity": 0,
+        "objective": "binary:logistic",
+        "base_score": 0.5,
+    }
 
     gbdt = xgb.train(params, dm, num_boost_round=10)
-    assert gbdt.get_split_value_histogram("not_there",
-                                          as_pandas=True).shape[0] == 0
-    assert gbdt.get_split_value_histogram("not_there",
-                                          as_pandas=False).shape[0] == 0
+    assert gbdt.get_split_value_histogram("not_there", as_pandas=True).shape[0] == 0
+    assert gbdt.get_split_value_histogram("not_there", as_pandas=False).shape[0] == 0
     assert gbdt.get_split_value_histogram("f28", bins=0).shape[0] == 1
     assert gbdt.get_split_value_histogram("f28", bins=1).shape[0] == 1
     assert gbdt.get_split_value_histogram("f28", bins=2).shape[0] == 2
@@ -748,11 +764,7 @@ def test_sklearn_get_default_params():
     cls = xgb.XGBClassifier()
     assert cls.get_params()["base_score"] is None
     cls.fit(X[:4, ...], y[:4, ...])
-    base_score = float(
-        json.loads(cls.get_booster().save_config())["learner"]["learner_model_param"][
-            "base_score"
-        ]
-    )
+    base_score = get_basescore(cls)
     np.testing.assert_equal(base_score, 0.5)
diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
index d295a3fc3308..18ddcc1ed287 100644
--- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
+++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
@@ -42,6 +42,7 @@
     from dask import array as da
     from dask.distributed import Client
     from dask_cuda import LocalCUDACluster
+    from xgboost.testing.dask import check_init_estimation
 
     from xgboost import dask as dxgb
 except ImportError:
@@ -220,6 +221,9 @@ def test_boost_from_prediction(self, local_cuda_client: Client) -> None:
         y = dd.from_array(y_, chunksize=50).map_partitions(cudf.from_pandas)
         run_boost_from_prediction_multi_class(X, y, "gpu_hist", local_cuda_client)
 
+    def test_init_estimation(self, local_cuda_client: Client) -> None:
+        check_init_estimation("gpu_hist", local_cuda_client)
+
     @pytest.mark.skipif(**tm.no_dask_cudf())
     def test_dask_dataframe(self, local_cuda_client: Client) -> None:
         run_with_dask_dataframe(dxgb.DaskDMatrix, local_cuda_client)
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index fbe5607a1240..244c6f1e2799 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -12,7 +12,7 @@
 from math import ceil
 from operator import attrgetter, getitem
 from pathlib import Path
-from typing import Any, Dict, Generator, Optional, Tuple, Type, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Type, TypeVar, Union
 
 import hypothesis
 import numpy as np
@@ -32,7 +32,7 @@
 import xgboost as xgb
 from xgboost import testing as tm
 
-pytestmark = [tm.timeout(30), pytest.mark.skipif(**tm.no_dask())]
+pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_dask())]
 
 import dask
 import dask.array as da
@@ -40,6 +40,7 @@
 from distributed import Client, LocalCluster
 from toolz import sliding_window  # dependency of dask
 from xgboost.dask import DaskDMatrix
+from xgboost.testing.dask import check_init_estimation
 
 dask.config.set({"distributed.scheduler.allowed-failures": False})
 
@@ -52,8 +53,10 @@
 
 @pytest.fixture(scope="module")
 def cluster() -> Generator:
+    n_threads = os.cpu_count()
+    assert n_threads is not None
     with LocalCluster(
-        n_workers=2, threads_per_worker=2, dashboard_address=":0"
+        n_workers=2, threads_per_worker=n_threads // 2, dashboard_address=":0"
     ) as dask_cluster:
         yield dask_cluster
 
@@ -151,12 +154,15 @@ def deterministic_persist_per_worker(df: dd.DataFrame, client: "Client") -> dd.D
     return df2
 
 
+Margin = TypeVar("Margin", dd.DataFrame, dd.Series, None)
+
+
 def deterministic_repartition(
     client: Client,
     X: dd.DataFrame,
     y: dd.Series,
-    m: Optional[Union[dd.DataFrame, dd.Series]],
-) -> Tuple[dd.DataFrame, dd.Series, Optional[Union[dd.DataFrame, dd.Series]]]:
+    m: Margin,
+) -> Tuple[dd.DataFrame, dd.Series, Margin]:
     # force repartition the data to avoid non-deterministic result
     if any(X.map_partitions(lambda x: _is_cudf_df(x)).compute()):
         # dask_cudf seems to be doing fine for now
@@ -474,14 +480,20 @@ def run_boost_from_prediction(
     X, y, margin = deterministic_repartition(client, X, y, margin)
     predictions_1: dd.Series = model_1.predict(X, base_margin=margin)
 
-    cls_2 = xgb.dask.DaskXGBClassifier(
+    model_2 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3, n_estimators=8, tree_method=tree_method, max_bin=512
     )
     X, y, _ = deterministic_repartition(client, X, y, None)
-    cls_2.fit(X=X, y=y)
-    predictions_2: dd.Series = cls_2.predict(X)
+    model_2.fit(X=X, y=y)
+    predictions_2: dd.Series = model_2.predict(X)
 
-    assert np.all(predictions_1.compute() == predictions_2.compute())
+    predt_1 = predictions_1.compute()
+    predt_2 = predictions_2.compute()
+    if hasattr(predt_1, "to_numpy"):
+        predt_1 = predt_1.to_numpy()
+    if hasattr(predt_2, "to_numpy"):
+        predt_2 = predt_2.to_numpy()
+    np.testing.assert_allclose(predt_1, predt_2, atol=1e-5)
 
     margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
     X, y, margin = deterministic_repartition(client, X, y, margin)
@@ -706,6 +718,7 @@ def run_dask_classifier(
 def test_dask_classifier(model: str, client: "Client") -> None:
     X, y, w = generate_array(with_weights=True)
     y = (y * 10).astype(np.int32)
+    assert w is not None
     run_dask_classifier(X, y, w, model, None, client, 10)
 
     y_bin = y.copy()
@@ -1386,16 +1399,22 @@ def run_updater_test(
         else:
             w = None
 
-        m = xgb.dask.DaskDMatrix(
-            client, data=X, label=y, weight=w)
-        history = xgb.dask.train(client, params=params, dtrain=m,
-                                 num_boost_round=num_rounds,
-                                 evals=[(m, 'train')])['history']
+        m = xgb.dask.DaskDMatrix(client, data=X, label=y, weight=w)
+        history = xgb.dask.train(
+            client,
+            params=params,
+            dtrain=m,
+            num_boost_round=num_rounds,
+            evals=[(m, "train")],
+        )["history"]
         note(history)
-        history = history['train'][dataset.metric]
+        history = history["train"][dataset.metric]
 
-        def is_stump() -> bool:
-            return params["max_depth"] == 1 or params["max_leaves"] == 1
+        def is_stump() -> bool:
+            return (
+                params.get("max_depth", None) == 1
+                or params.get("max_leaves", None) == 1
+            )
 
         def minimum_bin() -> bool:
             return "max_bin" in params and params["max_bin"] == 2
@@ -1410,7 +1429,11 @@ def minimum_bin() -> bool:
         else:
             assert tm.non_increasing(history)
         # Make sure that it's decreasing
-        assert history[-1] < history[0]
+        if is_stump():
+            # we might have already got the best score with base_score.
+            assert history[-1] <= history[0]
+        else:
+            assert history[-1] < history[0]
 
     @given(params=hist_parameter_strategy, dataset=tm.dataset_strategy)
@@ -1646,13 +1669,17 @@ def sqr(
         results_custom = reg.evals_result()
 
-        reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds, tree_method='hist')
+        reg = xgb.dask.DaskXGBRegressor(
+            n_estimators=rounds, tree_method="hist", base_score=0.5
+        )
         reg.fit(X, y, eval_set=[(X, y)])
         results_native = reg.evals_result()
 
-        np.testing.assert_allclose(results_custom['validation_0']['rmse'],
-                                   results_native['validation_0']['rmse'])
-        tm.non_increasing(results_native['validation_0']['rmse'])
+        np.testing.assert_allclose(
+            results_custom["validation_0"]["rmse"],
+            results_native["validation_0"]["rmse"],
+        )
+        tm.non_increasing(results_native["validation_0"]["rmse"])
 
     def test_no_duplicated_partition(self) -> None:
         '''Assert each worker has the correct amount of data, and DMatrix initialization doesn't
@@ -1994,6 +2021,10 @@ def _() -> xgb.dask.DaskXGBClassifier:
         assert f.result().get_booster().num_boosted_rounds() == i + 1
 
 
+def test_init_estimation(client: Client) -> None:
+    check_init_estimation("hist", client)
+
+
 class TestDaskCallbacks:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_early_stopping(self, client: "Client") -> None:
diff --git a/tests/test_distributed/test_with_spark/test_spark_local_cluster.py b/tests/test_distributed/test_with_spark/test_spark_local_cluster.py
index a18e395af069..cd8acbb6e463 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local_cluster.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local_cluster.py
@@ -1,7 +1,6 @@
 import json
 import os
 import random
-import sys
 import uuid
 
 import numpy as np
@@ -216,7 +215,7 @@ def setUp(self):
             ],
         )
         self.reg_best_score_eval = 5.239e-05
-        self.reg_best_score_weight_and_eval = 4.810e-05
+        self.reg_best_score_weight_and_eval = 4.850e-05
 
     def test_regressor_basic_with_params(self):
         regressor = SparkXGBRegressor(**self.reg_params)
diff --git a/tests/test_distributed/test_with_spark/utils.py b/tests/test_distributed/test_with_spark/utils.py
index 353154e1dbfb..847316fea5aa 100644
--- a/tests/test_distributed/test_with_spark/utils.py
+++ b/tests/test_distributed/test_with_spark/utils.py
@@ -4,16 +4,15 @@
 import sys
 import tempfile
 import unittest
+from io import StringIO
 
 import pytest
-from six import StringIO
 from xgboost import testing as tm
 
 pytestmark = [pytest.mark.skipif(**tm.no_spark())]
 
-
-from pyspark.sql import SparkSession, SQLContext
+from pyspark.sql import SparkSession
 from xgboost.spark.utils import _get_default_params_from_func